/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
/* All Rights Reserved */

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */


#pragma ident "%Z%%M% %I% %E% SMI"

/*
 * Directory manipulation routines.
 *
 * When manipulating directories, the i_rwlock provides serialization
 * since directories cannot be mmapped. The i_contents lock is redundant.
47 */ 48 49 #include <sys/types.h> 50 #include <sys/t_lock.h> 51 #include <sys/param.h> 52 #include <sys/systm.h> 53 #include <sys/signal.h> 54 #include <sys/cred.h> 55 #include <sys/proc.h> 56 #include <sys/disp.h> 57 #include <sys/user.h> 58 #include <sys/vfs.h> 59 #include <sys/vnode.h> 60 #include <sys/stat.h> 61 #include <sys/mode.h> 62 #include <sys/buf.h> 63 #include <sys/uio.h> 64 #include <sys/dnlc.h> 65 #include <sys/fs/ufs_inode.h> 66 #include <sys/fs/ufs_fs.h> 67 #include <sys/mount.h> 68 #include <sys/fs/ufs_fsdir.h> 69 #include <sys/fs/ufs_trans.h> 70 #include <sys/fs/ufs_panic.h> 71 #include <sys/fs/ufs_quota.h> 72 #include <sys/errno.h> 73 #include <sys/debug.h> 74 #include <vm/seg.h> 75 #include <sys/sysmacros.h> 76 #include <sys/cmn_err.h> 77 #include <sys/cpuvar.h> 78 #include <sys/unistd.h> 79 #include <sys/policy.h> 80 81 /* 82 * This is required since we're using P2ROUNDUP_TYPED on DIRBLKSIZ 83 */ 84 #if !ISP2(DIRBLKSIZ) 85 #error "DIRBLKSIZ not a power of 2" 86 #endif 87 88 /* 89 * A virgin directory. 90 */ 91 static struct dirtemplate mastertemplate = { 92 0, 12, 1, ".", 93 0, DIRBLKSIZ - 12, 2, ".." 94 }; 95 96 #define LDIRSIZ(len) \ 97 ((sizeof (struct direct) - (MAXNAMLEN + 1)) + ((len + 1 + 3) &~ 3)) 98 #define MAX_DIR_NAME_LEN(len) \ 99 (((len) - (sizeof (struct direct) - (MAXNAMLEN + 1))) - 1) 100 101 /* 102 * The dnlc directory cache allows a 64 bit handle for directory entries. 103 * For ufs we squeeze both the 32 bit inumber and a 32 bit disk offset 104 * into the handle. Note, a 32 bit offset allows a 4GB directory, which 105 * is way beyond what could be cached in memory by the directory 106 * caching routines. So we are quite safe with this limit. 107 * The macros below pack and unpack the handle. 
108 */ 109 #define H_TO_INO(h) (uint32_t)((h) & UINT_MAX) 110 #define H_TO_OFF(h) (off_t)((h) >> 32) 111 #define INO_OFF_TO_H(ino, off) (uint64_t)(((uint64_t)(off) << 32) | (ino)) 112 113 /* 114 * The average size of a typical on disk directory entry is about 16 bytes 115 * and so defines AV_DIRECT_SHIFT : log2(16) 116 * This define is only used to approximate the number of entries 117 * is a directory. This is needed for dnlc_dir_start() which will immediately 118 * return an error if the value is not within its acceptable range of 119 * number of files in a directory. 120 */ 121 #define AV_DIRECT_SHIFT 4 122 /* 123 * If the directory size (from i_size) is greater than the ufs_min_dir_cache 124 * tunable then we request dnlc directory caching. 125 * This has found to be profitable after 1024 file names. 126 */ 127 int ufs_min_dir_cache = 1024 << AV_DIRECT_SHIFT; 128 129 /* The time point the dnlc directory caching was disabled */ 130 static hrtime_t ufs_dc_disable_at; 131 /* directory caching disable duration */ 132 static hrtime_t ufs_dc_disable_duration = (hrtime_t)NANOSEC * 5; 133 134 #ifdef DEBUG 135 int dirchk = 1; 136 #else /* !DEBUG */ 137 int dirchk = 0; 138 #endif /* DEBUG */ 139 int ufs_negative_cache = 1; 140 uint64_t ufs_dirremove_retry_cnt; 141 142 static void dirbad(); 143 static int ufs_dirrename(); 144 static int ufs_diraddentry(); 145 static int ufs_dirempty(); 146 static int ufs_dirscan(); 147 static int ufs_dirclrdotdot(); 148 static int ufs_dirfixdotdot(); 149 static int ufs_dirpurgedotdot(); 150 static int dirprepareentry(); 151 static int ufs_dirmakedirect(); 152 static int dirbadname(); 153 static int dirmangled(); 154 155 /* 156 * Look for a given name in a directory. On successful return, *ipp 157 * will point to the VN_HELD inode. 
158 */ 159 int 160 ufs_dirlook( 161 struct inode *dp, 162 char *namep, 163 struct inode **ipp, 164 struct cred *cr, 165 int skipdnlc) /* skip the 1st level dnlc */ 166 { 167 uint64_t handle; 168 struct fbuf *fbp; /* a buffer of directory entries */ 169 struct direct *ep; /* the current directory entry */ 170 struct vnode *vp; 171 struct vnode *dvp; /* directory vnode ptr */ 172 struct ulockfs *ulp; 173 dcanchor_t *dcap; 174 off_t endsearch; /* offset to end directory search */ 175 off_t offset; 176 off_t start_off; /* starting offset from middle search */ 177 off_t last_offset; /* last offset */ 178 int entryoffsetinblock; /* offset of ep in addr's buffer */ 179 int numdirpasses; /* strategy for directory search */ 180 int namlen; /* length of name */ 181 int err; 182 int doingchk; 183 int i; 184 int caching; 185 int indeadlock; 186 ino_t ep_ino; /* entry i number */ 187 ino_t chkino; 188 ushort_t ep_reclen; /* direct local d_reclen */ 189 190 ASSERT(*namep != '\0'); /* All callers ensure *namep is non null */ 191 192 if (dp->i_ufsvfs) 193 ulp = &dp->i_ufsvfs->vfs_ulockfs; 194 /* 195 * Check accessibility of directory. 196 */ 197 if (((dp->i_mode & IFMT) != IFDIR) && 198 ((dp->i_mode & IFMT) != IFATTRDIR)) 199 return (ENOTDIR); 200 201 if (err = ufs_iaccess(dp, IEXEC, cr)) 202 return (err); 203 204 /* 205 * Check the directory name lookup cache, first for individual files 206 * then for complete directories. 207 */ 208 dvp = ITOV(dp); 209 if (!skipdnlc && (vp = dnlc_lookup(dvp, namep))) { 210 /* vp is already held from dnlc_lookup */ 211 if (vp == DNLC_NO_VNODE) { 212 VN_RELE(vp); 213 return (ENOENT); 214 } 215 *ipp = VTOI(vp); 216 return (0); 217 } 218 219 dcap = &dp->i_danchor; 220 221 /* 222 * Grab the reader lock on the directory data before checking 223 * the dnlc to avoid a race with ufs_dirremove() & friends. 224 * 225 * ufs_tryirwlock uses rw_tryenter and checks for SLOCK to 226 * avoid i_rwlock, ufs_lockfs_begin deadlock. 
If deadlock 227 * possible, retries the operation. 228 */ 229 ufs_tryirwlock((&dp->i_rwlock), RW_READER, retry_dircache); 230 if (indeadlock) 231 return (EAGAIN); 232 233 switch (dnlc_dir_lookup(dcap, namep, &handle)) { 234 case DFOUND: 235 ep_ino = (ino_t)H_TO_INO(handle); 236 if (dp->i_number == ep_ino) { 237 VN_HOLD(dvp); /* want ourself, "." */ 238 *ipp = dp; 239 rw_exit(&dp->i_rwlock); 240 return (0); 241 } 242 if (namep[0] == '.' && namep[1] == '.' && namep[2] == 0) { 243 uint64_t handle2; 244 /* 245 * release the lock on the dir we are searching 246 * to avoid a deadlock when grabbing the 247 * i_contents lock in ufs_iget_alloced(). 248 */ 249 rw_exit(&dp->i_rwlock); 250 rw_enter(&dp->i_ufsvfs->vfs_dqrwlock, RW_READER); 251 err = ufs_iget_alloced(dp->i_vfs, ep_ino, ipp, cr); 252 rw_exit(&dp->i_ufsvfs->vfs_dqrwlock); 253 /* 254 * must recheck as we dropped dp->i_rwlock 255 */ 256 ufs_tryirwlock(&dp->i_rwlock, RW_READER, retry_parent); 257 if (indeadlock) { 258 if (!err) 259 VN_RELE(ITOV(*ipp)); 260 return (EAGAIN); 261 } 262 if (!err && (dnlc_dir_lookup(dcap, namep, &handle2) 263 == DFOUND) && (handle == handle2)) { 264 dnlc_update(dvp, namep, ITOV(*ipp)); 265 rw_exit(&dp->i_rwlock); 266 return (0); 267 } 268 /* check failed, read the actual directory */ 269 if (!err) { 270 VN_RELE(ITOV(*ipp)); 271 } 272 goto restart; 273 } 274 /* usual case of not "." nor ".." 
*/ 275 rw_enter(&dp->i_ufsvfs->vfs_dqrwlock, RW_READER); 276 err = ufs_iget_alloced(dp->i_vfs, ep_ino, ipp, cr); 277 rw_exit(&dp->i_ufsvfs->vfs_dqrwlock); 278 if (err) { 279 rw_exit(&dp->i_rwlock); 280 return (err); 281 } 282 dnlc_update(dvp, namep, ITOV(*ipp)); 283 rw_exit(&dp->i_rwlock); 284 return (0); 285 case DNOENT: 286 if (ufs_negative_cache && (dp->i_nlink > 0)) { 287 dnlc_enter(dvp, namep, DNLC_NO_VNODE); 288 } 289 rw_exit(&dp->i_rwlock); 290 return (ENOENT); 291 default: 292 break; 293 } 294 restart: 295 296 fbp = NULL; 297 doingchk = 0; 298 chkino = 0; 299 caching = 0; 300 301 /* 302 * Attempt to cache any directories greater than the tunable 303 * ufs_min_cache_dir. If it fails due to memory shortage (DNOMEM), 304 * disable caching for this directory and record the system time. 305 * Any attempt after the disable time has expired will enable 306 * the caching again. 307 */ 308 if (dp->i_size >= ufs_min_dir_cache) { 309 /* 310 * if the directory caching disable time has expired 311 * enable the caching again. 312 */ 313 if (dp->i_cachedir == CD_DISABLED_NOMEM && 314 gethrtime() - ufs_dc_disable_at > ufs_dc_disable_duration) { 315 ufs_dc_disable_at = 0; 316 dp->i_cachedir = CD_ENABLED; 317 } 318 if (dp->i_cachedir == CD_ENABLED) { 319 switch (dnlc_dir_start(dcap, dp->i_size >> 320 AV_DIRECT_SHIFT)) { 321 case DNOMEM: 322 dp->i_cachedir = CD_DISABLED_NOMEM; 323 ufs_dc_disable_at = gethrtime(); 324 break; 325 case DTOOBIG: 326 dp->i_cachedir = CD_DISABLED_TOOBIG; 327 break; 328 case DOK: 329 caching = 1; 330 break; 331 default: 332 break; 333 } 334 } 335 } 336 /* 337 * If caching we don't stop when the file has been 338 * found, but need to know later, so clear *ipp now 339 */ 340 *ipp = NULL; 341 342 recheck: 343 if (caching) { 344 offset = 0; 345 entryoffsetinblock = 0; 346 numdirpasses = 1; 347 } else { 348 /* 349 * Take care to look at dp->i_diroff only once, as it 350 * may be changing due to other threads/cpus. 
351 */ 352 offset = dp->i_diroff; 353 if (offset > dp->i_size) { 354 offset = 0; 355 } 356 if (offset == 0) { 357 entryoffsetinblock = 0; 358 numdirpasses = 1; 359 } else { 360 start_off = offset; 361 362 entryoffsetinblock = blkoff(dp->i_fs, offset); 363 if (entryoffsetinblock != 0) { 364 err = blkatoff(dp, offset, (char **)0, &fbp); 365 if (err) 366 goto bad; 367 } 368 numdirpasses = 2; 369 } 370 } 371 endsearch = P2ROUNDUP_TYPED(dp->i_size, DIRBLKSIZ, u_offset_t); 372 namlen = strlen(namep); 373 last_offset = 0; 374 375 searchloop: 376 while (offset < endsearch) { 377 /* 378 * If offset is on a block boundary, 379 * read the next directory block. 380 * Release previous if it exists. 381 */ 382 if (blkoff(dp->i_fs, offset) == 0) { 383 if (fbp != NULL) { 384 fbrelse(fbp, S_OTHER); 385 } 386 err = blkatoff(dp, offset, (char **)0, &fbp); 387 if (err) 388 goto bad; 389 entryoffsetinblock = 0; 390 } 391 392 /* 393 * If the offset to the next entry is invalid or if the 394 * next entry is a zero length record or if the record 395 * length is invalid, then skip to the next directory 396 * block. Complete validation checks are done if the 397 * record length is invalid. 398 * 399 * Full validation checks are slow so they are disabled 400 * by default. Complete checks can be run by patching 401 * "dirchk" to be true. 402 * 403 * We have to check the validity of entryoffsetinblock 404 * here because it can be set to i_diroff above. 
405 */ 406 ep = (struct direct *)(fbp->fb_addr + entryoffsetinblock); 407 if ((entryoffsetinblock & 0x3) || ep->d_reclen == 0 || 408 (dirchk || (ep->d_reclen & 0x3)) && 409 dirmangled(dp, ep, entryoffsetinblock, offset)) { 410 i = DIRBLKSIZ - (entryoffsetinblock & (DIRBLKSIZ - 1)); 411 offset += i; 412 entryoffsetinblock += i; 413 if (caching) { 414 dnlc_dir_purge(dcap); 415 caching = 0; 416 } 417 continue; 418 } 419 420 ep_reclen = ep->d_reclen; 421 422 /* 423 * Add named entries and free space into the directory cache 424 */ 425 if (caching) { 426 ushort_t extra; 427 off_t off2; 428 429 if (ep->d_ino == 0) { 430 extra = ep_reclen; 431 if (offset & (DIRBLKSIZ - 1)) { 432 dnlc_dir_purge(dcap); 433 dp->i_cachedir = CD_DISABLED; 434 caching = 0; 435 } 436 } else { 437 /* 438 * entries hold the previous offset except the 439 * 1st which holds the offset + 1 440 */ 441 if (offset & (DIRBLKSIZ - 1)) { 442 off2 = last_offset; 443 } else { 444 off2 = offset + 1; 445 } 446 caching = (dnlc_dir_add_entry(dcap, ep->d_name, 447 INO_OFF_TO_H(ep->d_ino, off2)) == DOK); 448 extra = ep_reclen - DIRSIZ(ep); 449 } 450 if (caching && (extra >= LDIRSIZ(1))) { 451 caching = (dnlc_dir_add_space(dcap, extra, 452 (uint64_t)offset) == DOK); 453 } 454 } 455 456 /* 457 * Check for a name match. 458 * We have the parent inode read locked with i_rwlock. 459 */ 460 if (ep->d_ino && ep->d_namlen == namlen && 461 *namep == *ep->d_name && /* fast chk 1st chr */ 462 bcmp(namep, ep->d_name, (int)ep->d_namlen) == 0) { 463 464 /* 465 * We have to release the fbp early here to avoid 466 * a possible deadlock situation where we have the 467 * fbp and want the directory inode and someone doing 468 * a ufs_direnter_* has the directory inode and wants 469 * the fbp. XXX - is this still needed? 
470 */ 471 ep_ino = (ino_t)ep->d_ino; 472 ASSERT(fbp != NULL); 473 fbrelse(fbp, S_OTHER); 474 fbp = NULL; 475 476 /* 477 * Atomic update (read lock held) 478 */ 479 dp->i_diroff = offset; 480 481 if (namlen == 2 && namep[0] == '.' && namep[1] == '.') { 482 struct timeval32 omtime; 483 484 if (caching) { 485 dnlc_dir_purge(dcap); 486 caching = 0; 487 } 488 if (doingchk) { 489 /* 490 * if the inumber didn't change 491 * continue with already found inode. 492 */ 493 if (ep_ino == chkino) 494 goto checkok; 495 else { 496 VN_RELE(ITOV(*ipp)); 497 /* *ipp is nulled at restart */ 498 goto restart; 499 } 500 } 501 /* 502 * release the lock on the dir we are searching 503 * to avoid a deadlock when grabbing the 504 * i_contents lock in ufs_iget_alloced(). 505 */ 506 omtime = dp->i_mtime; 507 rw_exit(&dp->i_rwlock); 508 rw_enter(&dp->i_ufsvfs->vfs_dqrwlock, 509 RW_READER); 510 err = ufs_iget_alloced(dp->i_vfs, ep_ino, ipp, 511 cr); 512 rw_exit(&dp->i_ufsvfs->vfs_dqrwlock); 513 ufs_tryirwlock(&dp->i_rwlock, RW_READER, 514 retry_disk); 515 if (indeadlock) { 516 if (!err) 517 VN_RELE(ITOV(*ipp)); 518 return (EAGAIN); 519 } 520 if (err) 521 goto bad; 522 /* 523 * Since we released the lock on the directory, 524 * we must check that the same inode is still 525 * the ".." entry for this directory. 526 */ 527 /*CSTYLED*/ 528 if (timercmp(&omtime, &dp->i_mtime, !=)) { 529 /* 530 * Modification time changed on the 531 * directory, we must go check if 532 * the inumber changed for ".." 533 */ 534 doingchk = 1; 535 chkino = ep_ino; 536 entryoffsetinblock = 0; 537 if (caching) { 538 /* 539 * Forget directory caching 540 * for this rare case 541 */ 542 dnlc_dir_purge(dcap); 543 caching = 0; 544 } 545 goto recheck; 546 } 547 } else if (dp->i_number == ep_ino) { 548 VN_HOLD(dvp); /* want ourself, "." 
*/ 549 *ipp = dp; 550 if (caching) { 551 dnlc_dir_purge(dcap); 552 caching = 0; 553 } 554 } else { 555 rw_enter(&dp->i_ufsvfs->vfs_dqrwlock, 556 RW_READER); 557 err = ufs_iget_alloced(dp->i_vfs, ep_ino, ipp, 558 cr); 559 rw_exit(&dp->i_ufsvfs->vfs_dqrwlock); 560 if (err) 561 goto bad; 562 } 563 checkok: 564 ASSERT(*ipp); 565 dnlc_update(dvp, namep, ITOV(*ipp)); 566 /* 567 * If we are not caching then just return the entry 568 * otherwise complete loading up the cache 569 */ 570 if (!caching) { 571 rw_exit(&dp->i_rwlock); 572 return (0); 573 } 574 err = blkatoff(dp, offset, (char **)0, &fbp); 575 if (err) 576 goto bad; 577 } 578 last_offset = offset; 579 offset += ep_reclen; 580 entryoffsetinblock += ep_reclen; 581 } 582 /* 583 * If we started in the middle of the directory and failed 584 * to find our target, we must check the beginning as well. 585 */ 586 if (numdirpasses == 2) { 587 numdirpasses--; 588 offset = 0; 589 endsearch = start_off; 590 goto searchloop; 591 } 592 593 /* 594 * If whole directory caching is on (or was originally on) then 595 * the entry may have been found. 596 */ 597 if (*ipp == NULL) { 598 err = ENOENT; 599 if (ufs_negative_cache && (dp->i_nlink > 0)) { 600 dnlc_enter(dvp, namep, DNLC_NO_VNODE); 601 } 602 } 603 if (caching) { 604 dnlc_dir_complete(dcap); 605 caching = 0; 606 } 607 608 bad: 609 if (err && *ipp) { 610 /* 611 * err and *ipp can both be set if we were attempting to 612 * cache the directory, and we found the entry, then later 613 * while trying to complete the directory cache encountered 614 * a error (eg reading a directory sector). 615 */ 616 VN_RELE(ITOV(*ipp)); 617 *ipp = NULL; 618 } 619 620 if (fbp) 621 fbrelse(fbp, S_OTHER); 622 rw_exit(&dp->i_rwlock); 623 if (caching) 624 dnlc_dir_purge(dcap); 625 return (err); 626 } 627 628 /* 629 * Write a new directory entry for DE_CREATE or DE_MKDIR operations. 
630 */ 631 int 632 ufs_direnter_cm( 633 struct inode *tdp, /* target directory to make entry in */ 634 char *namep, /* name of entry */ 635 enum de_op op, /* entry operation */ 636 struct vattr *vap, /* attributes if new inode needed */ 637 struct inode **ipp, /* return entered inode here */ 638 struct cred *cr, /* user credentials */ 639 int flags) /* no entry exists */ 640 { 641 struct inode *tip; /* inode of (existing) target file */ 642 char *s; 643 struct ufs_slot slot; /* slot info to pass around */ 644 int namlen; /* length of name */ 645 int err; /* error number */ 646 struct inode *nip; /* new inode */ 647 int do_rele_nip = 0; /* release nip */ 648 int noentry = flags & ~IQUIET; 649 int quiet = flags & IQUIET; /* Suppress out of inodes message */ 650 int indeadlock; 651 struct ulockfs *ulp; 652 653 ASSERT(RW_WRITE_HELD(&tdp->i_rwlock)); 654 655 if (((tdp->i_mode & IFMT) == IFATTRDIR) && ((op == DE_MKDIR) || 656 ((vap->va_type == VCHR) || (vap->va_type == VBLK) || 657 (vap->va_type == VDOOR) || (vap->va_type == VSOCK) || 658 (vap->va_type == VFIFO)))) 659 return (EINVAL); 660 661 /* don't allow '/' characters in pathname component */ 662 for (s = namep, namlen = 0; *s; s++, namlen++) 663 if (*s == '/') 664 return (EACCES); 665 ASSERT(namlen); 666 667 /* 668 * If name is "." or ".." then if this is a create look it up 669 * and return EEXIST. 670 */ 671 if (namep[0] == '.' && 672 (namlen == 1 || (namlen == 2 && namep[1] == '.'))) { 673 /* 674 * ufs_dirlook will acquire the i_rwlock 675 */ 676 if (tdp->i_ufsvfs) 677 ulp = &tdp->i_ufsvfs->vfs_ulockfs; 678 rw_exit(&tdp->i_rwlock); 679 if (err = ufs_dirlook(tdp, namep, ipp, cr, 0)) { 680 if (err == EAGAIN) 681 return (err); 682 683 /* 684 * ufs_tryirwlock uses rw_tryenter and checks for 685 * SLOCK to avoid i_rwlock, ufs_lockfs_begin deadlock. 686 * If deadlock possible, retries the operation. 
687 */ 688 ufs_tryirwlock(&tdp->i_rwlock, RW_WRITER, retry_err); 689 if (indeadlock) 690 return (EAGAIN); 691 692 return (err); 693 } 694 ufs_tryirwlock(&tdp->i_rwlock, RW_WRITER, retry); 695 if (indeadlock) { 696 VN_RELE(ITOV(*ipp)); 697 return (EAGAIN); 698 } 699 return (EEXIST); 700 } 701 702 /* 703 * If target directory has not been removed, then we can consider 704 * allowing file to be created. 705 */ 706 if (tdp->i_nlink <= 0) { 707 return (ENOENT); 708 } 709 710 /* 711 * Check accessibility of directory. 712 */ 713 if (((tdp->i_mode & IFMT) != IFDIR) && 714 ((tdp->i_mode & IFMT) != IFATTRDIR)) { 715 return (ENOTDIR); 716 } 717 718 /* 719 * Execute access is required to search the directory. 720 */ 721 if (err = ufs_iaccess(tdp, IEXEC, cr)) { 722 return (err); 723 } 724 725 /* 726 * Search for the entry. Return VN_HELD tip if found. 727 */ 728 tip = NULL; 729 slot.fbp = NULL; 730 slot.status = NONE; 731 rw_enter(&tdp->i_ufsvfs->vfs_dqrwlock, RW_READER); 732 rw_enter(&tdp->i_contents, RW_WRITER); 733 err = ufs_dircheckforname(tdp, namep, namlen, &slot, &tip, cr, noentry); 734 if (err) 735 goto out; 736 if (tip) { 737 ASSERT(!noentry); 738 *ipp = tip; 739 err = EEXIST; 740 } else { 741 /* 742 * The entry does not exist. Check write permission in 743 * directory to see if entry can be created. 744 */ 745 if (err = ufs_iaccess(tdp, IWRITE, cr)) 746 goto out; 747 /* 748 * Make new inode and directory entry. 749 */ 750 tdp->i_flag |= quiet; 751 if (err = ufs_dirmakeinode(tdp, &nip, vap, op, cr)) { 752 if (nip != NULL) 753 do_rele_nip = 1; 754 goto out; 755 } 756 if (err = ufs_diraddentry(tdp, namep, op, 757 namlen, &slot, nip, NULL, cr)) { 758 /* 759 * Unmake the inode we just made. 
760 */ 761 rw_enter(&nip->i_contents, RW_WRITER); 762 if (((nip->i_mode & IFMT) == IFDIR) || 763 ((nip->i_mode & IFMT) == IFATTRDIR)) { 764 tdp->i_nlink--; 765 ufs_setreclaim(tdp); 766 tdp->i_flag |= ICHG; 767 tdp->i_seq++; 768 TRANS_INODE(tdp->i_ufsvfs, tdp); 769 ITIMES_NOLOCK(tdp); 770 } 771 nip->i_nlink = 0; 772 ufs_setreclaim(nip); 773 TRANS_INODE(nip->i_ufsvfs, nip); 774 nip->i_flag |= ICHG; 775 nip->i_seq++; 776 ITIMES_NOLOCK(nip); 777 rw_exit(&nip->i_contents); 778 do_rele_nip = 1; 779 } else { 780 *ipp = nip; 781 } 782 } 783 784 out: 785 if (slot.fbp) 786 fbrelse(slot.fbp, S_OTHER); 787 788 tdp->i_flag &= ~quiet; 789 rw_exit(&tdp->i_contents); 790 791 /* 792 * Drop vfs_dqrwlock before calling VN_RELE() on nip to 793 * avoid deadlock since ufs_delete() grabs vfs_dqrwlock as reader. 794 */ 795 rw_exit(&tdp->i_ufsvfs->vfs_dqrwlock); 796 797 if (do_rele_nip) { 798 VN_RELE(ITOV(nip)); 799 } 800 801 return (err); 802 } 803 804 /* 805 * Write a new directory entry for DE_LINK, DE_SYMLINK or DE_RENAME operations. 806 * If tvpp is non-null, return with the pointer to the target vnode. 807 */ 808 int 809 ufs_direnter_lr( 810 struct inode *tdp, /* target directory to make entry in */ 811 char *namep, /* name of entry */ 812 enum de_op op, /* entry operation */ 813 struct inode *sdp, /* source inode parent if rename */ 814 struct inode *sip, /* source inode */ 815 struct cred *cr, /* user credentials */ 816 vnode_t **tvpp) /* Return: (held) vnode of (existing) target */ 817 { 818 struct inode *tip; /* inode of (existing) target file */ 819 char *s; 820 struct ufs_slot slot; /* slot info to pass around */ 821 int namlen; /* length of name */ 822 int err; /* error number */ 823 824 /* don't allow '/' characters in pathname component */ 825 for (s = namep, namlen = 0; *s; s++, namlen++) 826 if (*s == '/') 827 return (EACCES); 828 ASSERT(namlen); 829 ASSERT(RW_WRITE_HELD(&tdp->i_rwlock)); 830 831 /* 832 * If name is "." or ".." 
then if this is a create look it up 833 * and return EEXIST. Rename or link TO "." or ".." is forbidden. 834 */ 835 if (namep[0] == '.' && 836 (namlen == 1 || (namlen == 2 && namep[1] == '.'))) { 837 if (op == DE_RENAME) { 838 return (EINVAL); /* *SIGH* should be ENOTEMPTY */ 839 } 840 return (EEXIST); 841 } 842 /* 843 * For link and rename lock the source entry and check the link count 844 * to see if it has been removed while it was unlocked. If not, we 845 * increment the link count and force the inode to disk to make sure 846 * that it is there before any directory entry that points to it. 847 * 848 * In the case of a symbolic link, we are dealing with a new inode 849 * which does not yet have any links. We've created it with a link 850 * count of 1, and we don't want to increment it since this will be 851 * its first link. 852 * 853 * We are about to push the inode to disk. We make sure 854 * that the inode's data blocks are flushed first so the 855 * inode and it's data blocks are always in sync. This 856 * adds some robustness in in the event of a power failure 857 * or panic where sync fails. If we panic before the 858 * inode is updated, then the inode still refers to the 859 * old data blocks (or none for a new file). If we panic 860 * after the inode is updated, then the inode refers to 861 * the new data blocks. 862 * 863 * We do this before grabbing the i_contents lock because 864 * ufs_syncip() will want that lock. We could do the data 865 * syncing after the removal checks, but upon return from 866 * the data sync we would have to repeat the removal 867 * checks. 
868 */ 869 if (err = TRANS_SYNCIP(sip, 0, I_DSYNC, TOP_FSYNC)) { 870 return (err); 871 } 872 873 rw_enter(&sip->i_contents, RW_WRITER); 874 if (sip->i_nlink <= 0) { 875 rw_exit(&sip->i_contents); 876 return (ENOENT); 877 } 878 if (sip->i_nlink == MAXLINK) { 879 rw_exit(&sip->i_contents); 880 return (EMLINK); 881 } 882 883 /* 884 * Sync the indirect blocks associated with the file 885 * for the same reasons as described above. Since this 886 * call wants the i_contents lock held for it we can do 887 * this here with no extra work. 888 */ 889 if (err = ufs_sync_indir(sip)) { 890 rw_exit(&sip->i_contents); 891 return (err); 892 } 893 894 if (op != DE_SYMLINK) 895 sip->i_nlink++; 896 TRANS_INODE(sip->i_ufsvfs, sip); 897 sip->i_flag |= ICHG; 898 sip->i_seq++; 899 ufs_iupdat(sip, I_SYNC); 900 rw_exit(&sip->i_contents); 901 902 /* 903 * If target directory has not been removed, then we can consider 904 * allowing file to be created. 905 */ 906 if (tdp->i_nlink <= 0) { 907 err = ENOENT; 908 goto out2; 909 } 910 /* 911 * Check accessibility of directory. 912 */ 913 if (((tdp->i_mode & IFMT) != IFDIR) && 914 (tdp->i_mode & IFMT) != IFATTRDIR) { 915 err = ENOTDIR; 916 goto out2; 917 } 918 /* 919 * Execute access is required to search the directory. 920 */ 921 if (err = ufs_iaccess(tdp, IEXEC, cr)) { 922 goto out2; 923 } 924 925 /* 926 * Search for the entry. Return VN_HELD tip if found. 927 */ 928 tip = NULL; 929 slot.status = NONE; 930 slot.fbp = NULL; 931 rw_enter(&tdp->i_ufsvfs->vfs_dqrwlock, RW_READER); 932 rw_enter(&tdp->i_contents, RW_WRITER); 933 err = ufs_dircheckforname(tdp, namep, namlen, &slot, &tip, cr, 0); 934 if (err) 935 goto out; 936 937 if (tip) { 938 switch (op) { 939 case DE_RENAME: 940 err = ufs_dirrename(sdp, sip, tdp, namep, 941 tip, &slot, cr); 942 break; 943 944 case DE_LINK: 945 case DE_SYMLINK: 946 /* 947 * Can't link to an existing file. 
948 */ 949 err = EEXIST; 950 break; 951 default: 952 break; 953 } 954 } else { 955 /* 956 * The entry does not exist. Check write permission in 957 * directory to see if entry can be created. 958 */ 959 if (err = ufs_iaccess(tdp, IWRITE, cr)) 960 goto out; 961 err = ufs_diraddentry(tdp, namep, op, namlen, &slot, sip, sdp, 962 cr); 963 } 964 965 out: 966 if (slot.fbp) 967 fbrelse(slot.fbp, S_OTHER); 968 969 rw_exit(&tdp->i_contents); 970 971 /* 972 * Drop vfs_dqrwlock before calling VN_RELE() on tip to 973 * avoid deadlock since ufs_delete() grabs vfs_dqrwlock as reader. 974 */ 975 rw_exit(&tdp->i_ufsvfs->vfs_dqrwlock); 976 977 /* 978 * If we renamed a file over the top of an existing file, 979 * or linked a file to an existing file (or tried to), 980 * then set *tvpp to the target vnode, if tvpp is non-null 981 * otherwise, release and delete (or just release) the inode. 982 * 983 * N.B., by returning the target's vnode pointer to the caller, 984 * that caller becomes responsible for doing the VN_RELE. 985 */ 986 if (tip) { 987 if ((err == 0) && (tvpp != NULL)) { 988 *tvpp = ITOV(tip); 989 } else { 990 VN_RELE(ITOV(tip)); 991 } 992 } 993 994 out2: 995 if (err) { 996 /* 997 * Undo bumped link count. 998 */ 999 if (op != DE_SYMLINK) { 1000 rw_enter(&sip->i_contents, RW_WRITER); 1001 sip->i_nlink--; 1002 ufs_setreclaim(sip); 1003 TRANS_INODE(sip->i_ufsvfs, sip); 1004 sip->i_flag |= ICHG; 1005 sip->i_seq++; 1006 ITIMES_NOLOCK(sip); 1007 rw_exit(&sip->i_contents); 1008 } 1009 } 1010 return (err); 1011 } 1012 1013 /* 1014 * Check for the existence of a name in a directory (unless noentry 1015 * is set) , or else of an empty 1016 * slot in which an entry may be made. If the requested name is found, 1017 * then on return *ipp points at the inode and *offp contains 1018 * its offset in the directory. 
If the name is not found, then *ipp 1019 * will be NULL and *slotp will contain information about a directory slot in 1020 * which an entry may be made (either an empty slot, or the first position 1021 * past the end of the directory). 1022 * The target directory inode (tdp) is supplied write locked (i_rwlock). 1023 * 1024 * This may not be used on "." or "..", but aliases of "." are ok. 1025 */ 1026 int 1027 ufs_dircheckforname( 1028 struct inode *tdp, /* inode of directory being checked */ 1029 char *namep, /* name we're checking for */ 1030 int namlen, /* length of name, excluding null */ 1031 struct ufs_slot *slotp, /* slot structure */ 1032 struct inode **ipp, /* return inode if we find one */ 1033 struct cred *cr, 1034 int noentry) /* noentry - just look for space */ 1035 { 1036 uint64_t handle; 1037 struct fbuf *fbp; /* pointer to directory block */ 1038 struct direct *ep; /* directory entry */ 1039 struct direct *nep; /* next directory entry */ 1040 dcanchor_t *dcap; 1041 vnode_t *dvp; /* directory vnode ptr */ 1042 off_t dirsize; /* size of the directory */ 1043 off_t offset; /* offset in the directory */ 1044 off_t last_offset; /* last offset */ 1045 off_t enduseful; /* pointer past last used dir slot */ 1046 int entryoffsetinblk; /* offset of ep in fbp's buffer */ 1047 int i; /* length of mangled entry */ 1048 int needed; 1049 int err; 1050 int first; 1051 int caching; 1052 int stat; 1053 ino_t ep_ino; 1054 slotstat_t initstat = slotp->status; 1055 1056 ASSERT(RW_WRITE_HELD(&tdp->i_rwlock)); 1057 ASSERT(RW_WRITE_HELD(&tdp->i_contents)); 1058 ASSERT(*ipp == NULL); 1059 fbp = NULL; 1060 1061 /* 1062 * First check if there is a complete cache of the directory. 1063 */ 1064 dvp = ITOV(tdp); 1065 1066 dcap = &tdp->i_danchor; 1067 if (noentry) { 1068 /* 1069 * We know from the 1st level dnlc cache that the entry 1070 * doesn't exist, so don't bother searching the directory 1071 * cache, but just look for space (possibly in the directory 1072 * cache). 
1073 */ 1074 stat = DNOENT; 1075 } else { 1076 stat = dnlc_dir_lookup(dcap, namep, &handle); 1077 } 1078 switch (stat) { 1079 case DFOUND: 1080 ep_ino = (ino_t)H_TO_INO(handle); 1081 if (tdp->i_number == ep_ino) { 1082 *ipp = tdp; /* we want ourself, ie "." */ 1083 VN_HOLD(dvp); 1084 } else { 1085 err = ufs_iget_alloced(tdp->i_vfs, ep_ino, ipp, cr); 1086 if (err) 1087 return (err); 1088 } 1089 offset = H_TO_OFF(handle); 1090 first = 0; 1091 if (offset & 1) { 1092 /* This is the first entry in the block */ 1093 first = 1; 1094 offset -= 1; 1095 ASSERT((offset & (DIRBLKSIZ - 1)) == 0); 1096 } 1097 err = blkatoff(tdp, offset, (char **)&ep, &fbp); 1098 if (err) { 1099 VN_RELE(ITOV(*ipp)); 1100 *ipp = NULL; 1101 return (err); 1102 } 1103 /* 1104 * Check the validity of the entry. 1105 * If it's bad, then throw away the cache and 1106 * continue without it. The dirmangled() routine 1107 * will then be called upon it. 1108 */ 1109 if ((ep->d_reclen == 0) || (ep->d_reclen & 0x3)) { 1110 VN_RELE(ITOV(*ipp)); 1111 *ipp = NULL; 1112 dnlc_dir_purge(dcap); 1113 break; 1114 } 1115 /* 1116 * Remember the returned offset is the offset of the 1117 * preceding record (unless this is the 1st record 1118 * in the DIRBLKSIZ sized block (disk sector)), then it's 1119 * offset + 1. Note, no real offsets are on odd boundaries. 1120 */ 1121 if (first) { 1122 ASSERT((offset & (DIRBLKSIZ - 1)) == 0); 1123 slotp->offset = offset; 1124 slotp->size = 0; 1125 slotp->ep = ep; 1126 } else { 1127 /* get the next entry */ 1128 nep = (struct direct *)((char *)ep + ep->d_reclen); 1129 /* 1130 * Check the validity of this entry as well 1131 * If it's bad, then throw away the cache and 1132 * continue without it. The dirmangled() routine 1133 * will then be called upon it. 
1134 */ 1135 if ((nep->d_reclen == 0) || (nep->d_reclen & 0x3) || 1136 (nep->d_ino != ep_ino)) { 1137 VN_RELE(ITOV(*ipp)); 1138 *ipp = NULL; 1139 dnlc_dir_purge(dcap); 1140 break; 1141 } 1142 slotp->offset = offset + ep->d_reclen; 1143 slotp->size = ep->d_reclen; 1144 slotp->ep = nep; 1145 } 1146 slotp->status = EXIST; 1147 slotp->fbp = fbp; 1148 slotp->endoff = 0; 1149 slotp->cached = 1; 1150 dnlc_update(dvp, namep, ITOV(*ipp)); 1151 return (0); 1152 case DNOENT: 1153 /* 1154 * The caller gets to set the initial slot status to 1155 * indicate whether it's interested in getting a 1156 * empty slot. For example, the status can be set 1157 * to FOUND when an entry is being deleted. 1158 */ 1159 ASSERT(slotp->fbp == NULL); 1160 if (slotp->status == FOUND) { 1161 return (0); 1162 } 1163 switch (dnlc_dir_rem_space_by_len(dcap, LDIRSIZ(namlen), 1164 &handle)) { 1165 case DFOUND: 1166 offset = (off_t)handle; 1167 err = blkatoff(tdp, offset, (char **)&ep, &fbp); 1168 if (err) { 1169 dnlc_dir_purge(dcap); 1170 ASSERT(*ipp == NULL); 1171 return (err); 1172 } 1173 /* 1174 * Check the validity of the entry. 1175 * If it's bad, then throw away the cache and 1176 * continue without it. The dirmangled() routine 1177 * will then be called upon it. 1178 */ 1179 if ((ep->d_reclen == 0) || (ep->d_reclen & 0x3)) { 1180 dnlc_dir_purge(dcap); 1181 break; 1182 } 1183 /* 1184 * Remember the returned offset is the offset of the 1185 * containing record. 1186 */ 1187 slotp->status = FOUND; 1188 slotp->ep = ep; 1189 slotp->offset = offset; 1190 slotp->fbp = fbp; 1191 slotp->size = ep->d_reclen; 1192 /* 1193 * Set end offset to 0. Truncation is handled 1194 * because the dnlc cache will blow away the 1195 * cached directory when an entry is removed 1196 * that drops the entries left to less than half 1197 * the minumum number (dnlc_min_dir_cache). 
1198 */ 1199 slotp->endoff = 0; 1200 slotp->cached = 1; 1201 return (0); 1202 case DNOENT: 1203 slotp->status = NONE; 1204 slotp->offset = P2ROUNDUP_TYPED(tdp->i_size, 1205 DIRBLKSIZ, u_offset_t); 1206 slotp->size = DIRBLKSIZ; 1207 slotp->endoff = 0; 1208 slotp->cached = 1; 1209 return (0); 1210 default: 1211 break; 1212 } 1213 break; 1214 } 1215 slotp->cached = 0; 1216 caching = NULL; 1217 if (!noentry && tdp->i_size >= ufs_min_dir_cache) { 1218 /* 1219 * if the directory caching disable time has expired 1220 * enable caching again. 1221 */ 1222 if (tdp->i_cachedir == CD_DISABLED_NOMEM && 1223 gethrtime() - ufs_dc_disable_at > ufs_dc_disable_duration) { 1224 ufs_dc_disable_at = 0; 1225 tdp->i_cachedir = CD_ENABLED; 1226 } 1227 /* 1228 * Attempt to cache any directories greater than the tunable 1229 * ufs_min_cache_dir. If it fails due to memory shortage 1230 * (DNOMEM), disable caching for this directory and record 1231 * the system time. Any attempt after the disable time has 1232 * expired will enable the caching again. 1233 */ 1234 if (tdp->i_cachedir == CD_ENABLED) { 1235 switch (dnlc_dir_start(dcap, 1236 tdp->i_size >> AV_DIRECT_SHIFT)) { 1237 case DNOMEM: 1238 tdp->i_cachedir = CD_DISABLED_NOMEM; 1239 ufs_dc_disable_at = gethrtime(); 1240 break; 1241 case DTOOBIG: 1242 tdp->i_cachedir = CD_DISABLED_TOOBIG; 1243 break; 1244 case DOK: 1245 caching = 1; 1246 break; 1247 default: 1248 break; 1249 } 1250 } 1251 } 1252 1253 /* 1254 * No point in using i_diroff since we must search whole directory 1255 */ 1256 dirsize = P2ROUNDUP_TYPED(tdp->i_size, DIRBLKSIZ, u_offset_t); 1257 enduseful = 0; 1258 offset = last_offset = 0; 1259 entryoffsetinblk = 0; 1260 needed = (int)LDIRSIZ(namlen); 1261 while (offset < dirsize) { 1262 /* 1263 * If offset is on a block boundary, 1264 * read the next directory block. 1265 * Release previous if it exists. 
1266 */ 1267 if (blkoff(tdp->i_fs, offset) == 0) { 1268 if (fbp != NULL) 1269 fbrelse(fbp, S_OTHER); 1270 1271 err = blkatoff(tdp, offset, (char **)0, &fbp); 1272 if (err) { 1273 ASSERT(*ipp == NULL); 1274 if (caching) { 1275 dnlc_dir_purge(dcap); 1276 } 1277 return (err); 1278 } 1279 entryoffsetinblk = 0; 1280 } 1281 /* 1282 * If still looking for a slot, and at a DIRBLKSIZ 1283 * boundary, have to start looking for free space 1284 * again. 1285 */ 1286 if (slotp->status == NONE && 1287 (entryoffsetinblk & (DIRBLKSIZ - 1)) == 0) { 1288 slotp->offset = -1; 1289 } 1290 /* 1291 * If the next entry is a zero length record or if the 1292 * record length is invalid, then skip to the next 1293 * directory block. Complete validation checks are 1294 * done if the record length is invalid. 1295 * 1296 * Full validation checks are slow so they are disabled 1297 * by default. Complete checks can be run by patching 1298 * "dirchk" to be true. 1299 * 1300 * We do not have to check the validity of 1301 * entryoffsetinblk here because it starts out as zero 1302 * and is only incremented by d_reclen values that we 1303 * validate here. 
1304 */ 1305 ep = (struct direct *)(fbp->fb_addr + entryoffsetinblk); 1306 if (ep->d_reclen == 0 || 1307 (dirchk || (ep->d_reclen & 0x3)) && 1308 dirmangled(tdp, ep, entryoffsetinblk, offset)) { 1309 i = DIRBLKSIZ - (entryoffsetinblk & (DIRBLKSIZ - 1)); 1310 offset += i; 1311 entryoffsetinblk += i; 1312 if (caching) { 1313 dnlc_dir_purge(dcap); 1314 caching = 0; 1315 } 1316 continue; 1317 } 1318 1319 /* 1320 * Add named entries and free space into the directory cache 1321 */ 1322 if (caching) { 1323 ushort_t extra; 1324 off_t off2; 1325 1326 if (ep->d_ino == 0) { 1327 extra = ep->d_reclen; 1328 if (offset & (DIRBLKSIZ - 1)) { 1329 dnlc_dir_purge(dcap); 1330 caching = 0; 1331 } 1332 } else { 1333 /* 1334 * entries hold the previous offset if 1335 * not the 1st one 1336 */ 1337 if (offset & (DIRBLKSIZ - 1)) { 1338 off2 = last_offset; 1339 } else { 1340 off2 = offset + 1; 1341 } 1342 caching = (dnlc_dir_add_entry(dcap, ep->d_name, 1343 INO_OFF_TO_H(ep->d_ino, off2)) == DOK); 1344 extra = ep->d_reclen - DIRSIZ(ep); 1345 } 1346 if (caching && (extra >= LDIRSIZ(1))) { 1347 caching = (dnlc_dir_add_space(dcap, extra, 1348 (uint64_t)offset) == DOK); 1349 } 1350 } 1351 1352 /* 1353 * If an appropriate sized slot has not yet been found, 1354 * check to see if one is available. 1355 */ 1356 if ((slotp->status != FOUND) && (slotp->status != EXIST)) { 1357 int size = ep->d_reclen; 1358 1359 if (ep->d_ino != 0) 1360 size -= DIRSIZ(ep); 1361 if (size > 0) { 1362 if (size >= needed) { 1363 slotp->offset = offset; 1364 slotp->size = ep->d_reclen; 1365 if (noentry) { 1366 slotp->ep = ep; 1367 slotp->fbp = fbp; 1368 slotp->status = FOUND; 1369 slotp->endoff = 0; 1370 return (0); 1371 } 1372 slotp->status = FOUND; 1373 } else if (slotp->status == NONE) { 1374 if (slotp->offset == -1) 1375 slotp->offset = offset; 1376 } 1377 } 1378 } 1379 /* 1380 * Check for a name match. 
1381 */ 1382 if (ep->d_ino && ep->d_namlen == namlen && 1383 *namep == *ep->d_name && /* fast chk 1st char */ 1384 bcmp(namep, ep->d_name, namlen) == 0) { 1385 1386 tdp->i_diroff = offset; 1387 1388 if (tdp->i_number == ep->d_ino) { 1389 *ipp = tdp; /* we want ourself, ie "." */ 1390 VN_HOLD(dvp); 1391 } else { 1392 err = ufs_iget_alloced(tdp->i_vfs, 1393 (ino_t)ep->d_ino, ipp, cr); 1394 if (err) { 1395 fbrelse(fbp, S_OTHER); 1396 if (caching) 1397 dnlc_dir_purge(dcap); 1398 return (err); 1399 } 1400 } 1401 slotp->status = EXIST; 1402 slotp->offset = offset; 1403 slotp->size = (int)(offset - last_offset); 1404 slotp->fbp = fbp; 1405 slotp->ep = ep; 1406 slotp->endoff = 0; 1407 if (caching) 1408 dnlc_dir_purge(dcap); 1409 return (0); 1410 } 1411 last_offset = offset; 1412 offset += ep->d_reclen; 1413 entryoffsetinblk += ep->d_reclen; 1414 if (ep->d_ino) 1415 enduseful = offset; 1416 } 1417 if (fbp) { 1418 fbrelse(fbp, S_OTHER); 1419 } 1420 1421 if (caching) { 1422 dnlc_dir_complete(dcap); 1423 slotp->cached = 1; 1424 if (slotp->status == FOUND) { 1425 if (initstat == FOUND) { 1426 return (0); 1427 } 1428 (void) dnlc_dir_rem_space_by_handle(dcap, 1429 slotp->offset); 1430 slotp->endoff = 0; 1431 return (0); 1432 } 1433 } 1434 1435 if (slotp->status == NONE) { 1436 /* 1437 * We didn't find a slot; the new directory entry should be put 1438 * at the end of the directory. Return an indication of where 1439 * this is, and set "endoff" to zero; since we're going to have 1440 * to extend the directory, we're certainly not going to 1441 * truncate it. 1442 */ 1443 slotp->offset = dirsize; 1444 slotp->size = DIRBLKSIZ; 1445 slotp->endoff = 0; 1446 } else { 1447 /* 1448 * We found a slot, and will return an indication of where that 1449 * slot is, as any new directory entry will be put there. 1450 * Since that slot will become a useful entry, if the last 1451 * useful entry we found was before this one, update the offset 1452 * of the last useful entry. 
1453 */ 1454 if (enduseful < slotp->offset + slotp->size) 1455 enduseful = slotp->offset + slotp->size; 1456 slotp->endoff = P2ROUNDUP_TYPED(enduseful, DIRBLKSIZ, off_t); 1457 } 1458 *ipp = NULL; 1459 return (0); 1460 } 1461 1462 uint64_t ufs_dirrename_retry_cnt; 1463 1464 /* 1465 * Rename the entry in the directory tdp so that it points to 1466 * sip instead of tip. 1467 */ 1468 static int 1469 ufs_dirrename( 1470 struct inode *sdp, /* parent directory of source */ 1471 struct inode *sip, /* source inode */ 1472 struct inode *tdp, /* parent directory of target */ 1473 char *namep, /* entry we are trying to change */ 1474 struct inode *tip, /* target inode */ 1475 struct ufs_slot *slotp, /* slot for entry */ 1476 struct cred *cr) /* credentials */ 1477 { 1478 vnode_t *tdvp; 1479 off_t offset; 1480 int err; 1481 int doingdirectory; 1482 1483 ASSERT(sdp->i_ufsvfs != NULL); 1484 ASSERT(RW_WRITE_HELD(&tdp->i_rwlock)); 1485 ASSERT(RW_WRITE_HELD(&tdp->i_contents)); 1486 /* 1487 * Short circuit rename of something to itself. 1488 */ 1489 if (sip->i_number == tip->i_number) { 1490 return (ESAME); /* special KLUDGE error code */ 1491 } 1492 1493 /* 1494 * We're locking 2 peer level locks, so must use tryenter 1495 * on the 2nd to avoid deadlocks that would occur 1496 * if we renamed a->b and b->a concurrently. 1497 */ 1498 retry: 1499 rw_enter(&tip->i_contents, RW_WRITER); 1500 if (!rw_tryenter(&sip->i_contents, RW_READER)) { 1501 /* 1502 * drop tip and wait (sleep) until we stand a chance 1503 * of holding sip 1504 */ 1505 rw_exit(&tip->i_contents); 1506 rw_enter(&sip->i_contents, RW_READER); 1507 /* 1508 * Reverse the lock grabs in case we have heavy 1509 * contention on the 2nd lock. 1510 */ 1511 if (!rw_tryenter(&tip->i_contents, RW_WRITER)) { 1512 ufs_dirrename_retry_cnt++; 1513 rw_exit(&sip->i_contents); 1514 goto retry; 1515 } 1516 } 1517 1518 /* 1519 * Check that everything is on the same filesystem. 
1520 */ 1521 if ((ITOV(tip)->v_vfsp != ITOV(tdp)->v_vfsp) || 1522 (ITOV(tip)->v_vfsp != ITOV(sip)->v_vfsp)) { 1523 err = EXDEV; /* XXX archaic */ 1524 goto out; 1525 } 1526 /* 1527 * Must have write permission to rewrite target entry. 1528 * Perform additional checks for sticky directories. 1529 */ 1530 if ((err = ufs_iaccess(tdp, IWRITE, cr)) != 0 || 1531 (err = ufs_sticky_remove_access(tdp, tip, cr)) != 0) 1532 goto out; 1533 1534 /* 1535 * Ensure source and target are compatible (both directories 1536 * or both not directories). If target is a directory it must 1537 * be empty and have no links to it; in addition it must not 1538 * be a mount point, and both the source and target must be 1539 * writable. 1540 */ 1541 doingdirectory = (((sip->i_mode & IFMT) == IFDIR) || 1542 ((sip->i_mode & IFMT) == IFATTRDIR)); 1543 if (((tip->i_mode & IFMT) == IFDIR) || 1544 ((tip->i_mode & IFMT) == IFATTRDIR)) { 1545 if (!doingdirectory) { 1546 err = EISDIR; 1547 goto out; 1548 } 1549 /* 1550 * vn_vfsrlock will prevent mounts from using the directory 1551 * until we are done. 1552 */ 1553 if (vn_vfsrlock(ITOV(tip))) { 1554 err = EBUSY; 1555 goto out; 1556 } 1557 if (vn_mountedvfs(ITOV(tip)) != NULL) { 1558 vn_vfsunlock(ITOV(tip)); 1559 err = EBUSY; 1560 goto out; 1561 } 1562 if (!ufs_dirempty(tip, tdp->i_number, cr) || tip->i_nlink > 2) { 1563 vn_vfsunlock(ITOV(tip)); 1564 err = EEXIST; /* SIGH should be ENOTEMPTY */ 1565 goto out; 1566 } 1567 } else if (doingdirectory) { 1568 err = ENOTDIR; 1569 goto out; 1570 } 1571 1572 /* 1573 * Rewrite the inode pointer for target name entry 1574 * from the target inode (ip) to the source inode (sip). 1575 * This prevents the target entry from disappearing 1576 * during a crash. Mark the directory inode to reflect the changes. 
1577 */ 1578 tdvp = ITOV(tdp); 1579 slotp->ep->d_ino = (int32_t)sip->i_number; 1580 dnlc_update(tdvp, namep, ITOV(sip)); 1581 if (slotp->size) { 1582 offset = slotp->offset - slotp->size; 1583 } else { 1584 offset = slotp->offset + 1; 1585 } 1586 if (slotp->cached) { 1587 (void) dnlc_dir_update(&tdp->i_danchor, namep, 1588 INO_OFF_TO_H(slotp->ep->d_ino, offset)); 1589 } 1590 1591 err = TRANS_DIR(tdp, slotp->offset); 1592 if (err) 1593 fbrelse(slotp->fbp, S_OTHER); 1594 else 1595 err = ufs_fbwrite(slotp->fbp, tdp); 1596 1597 slotp->fbp = NULL; 1598 if (err) { 1599 if (doingdirectory) 1600 vn_vfsunlock(ITOV(tip)); 1601 goto out; 1602 } 1603 1604 TRANS_INODE(tdp->i_ufsvfs, tdp); 1605 tdp->i_flag |= IUPD|ICHG; 1606 tdp->i_seq++; 1607 ITIMES_NOLOCK(tdp); 1608 1609 /* 1610 * Decrement the link count of the target inode. 1611 * Fix the ".." entry in sip to point to dp. 1612 * This is done after the new entry is on the disk. 1613 */ 1614 tip->i_nlink--; 1615 TRANS_INODE(tip->i_ufsvfs, tip); 1616 tip->i_flag |= ICHG; 1617 tip->i_seq++; 1618 ITIMES_NOLOCK(tip); 1619 if (doingdirectory) { 1620 /* 1621 * The entry for tip no longer exists so I can unlock the 1622 * vfslock. 1623 */ 1624 vn_vfsunlock(ITOV(tip)); 1625 /* 1626 * Decrement target link count once more if it was a directory. 1627 */ 1628 if (--tip->i_nlink != 0) { 1629 err = ufs_fault(ITOV(tip), 1630 "ufs_dirrename: target directory link count != 0 (%s)", 1631 tip->i_fs->fs_fsmnt); 1632 rw_exit(&tip->i_contents); 1633 return (err); 1634 } 1635 TRANS_INODE(tip->i_ufsvfs, tip); 1636 ufs_setreclaim(tip); 1637 /* 1638 * Renaming a directory with the parent different 1639 * requires that ".." be rewritten. The window is 1640 * still there for ".." to be inconsistent, but this 1641 * is unavoidable, and a lot shorter than when it was 1642 * done in a user process. We decrement the link 1643 * count in the new parent as appropriate to reflect 1644 * the just-removed target. 
If the parent is the 1645 * same, this is appropriate since the original 1646 * directory is going away. If the new parent is 1647 * different, ufs_dirfixdotdot() will bump the link count 1648 * back. 1649 */ 1650 tdp->i_nlink--; 1651 ufs_setreclaim(tdp); 1652 TRANS_INODE(tdp->i_ufsvfs, tdp); 1653 tdp->i_flag |= ICHG; 1654 tdp->i_seq++; 1655 ITIMES_NOLOCK(tdp); 1656 if (sdp != tdp) { 1657 rw_exit(&tip->i_contents); 1658 rw_exit(&sip->i_contents); 1659 err = ufs_dirfixdotdot(sip, sdp, tdp); 1660 return (err); 1661 } 1662 } else 1663 ufs_setreclaim(tip); 1664 out: 1665 rw_exit(&tip->i_contents); 1666 rw_exit(&sip->i_contents); 1667 return (err); 1668 } 1669 1670 /* 1671 * Fix the ".." entry of the child directory so that it points 1672 * to the new parent directory instead of the old one. Routine 1673 * assumes that dp is a directory and that all the inodes are on 1674 * the same file system. 1675 */ 1676 static int 1677 ufs_dirfixdotdot( 1678 struct inode *dp, /* child directory */ 1679 struct inode *opdp, /* old parent directory */ 1680 struct inode *npdp) /* new parent directory */ 1681 { 1682 struct fbuf *fbp; 1683 struct dirtemplate *dirp; 1684 vnode_t *dvp; 1685 int err; 1686 1687 ASSERT(RW_WRITE_HELD(&npdp->i_rwlock)); 1688 ASSERT(RW_WRITE_HELD(&npdp->i_contents)); 1689 1690 /* 1691 * We hold the child directory's i_contents lock before calling 1692 * blkatoff so that we honor correct locking protocol which is 1693 * i_contents lock and then page lock. (blkatoff will call 1694 * ufs_getpage where we want the page lock) 1695 * We hold the child directory's i_rwlock before i_contents (as 1696 * per the locking protocol) since we are modifying the ".." entry 1697 * of the child directory. 1698 * We hold the i_rwlock and i_contents lock until we record 1699 * this directory delta to the log (via ufs_trans_dir) and have 1700 * done fbrelse. 
1701 */ 1702 rw_enter(&dp->i_rwlock, RW_WRITER); 1703 rw_enter(&dp->i_contents, RW_WRITER); 1704 err = blkatoff(dp, (off_t)0, (char **)&dirp, &fbp); 1705 if (err) 1706 goto bad; 1707 1708 if (dp->i_nlink <= 0 || 1709 dp->i_size < sizeof (struct dirtemplate)) { 1710 err = ENOENT; 1711 goto bad; 1712 } 1713 1714 if (dirp->dotdot_namlen != 2 || 1715 dirp->dotdot_name[0] != '.' || 1716 dirp->dotdot_name[1] != '.') { /* Sanity check. */ 1717 dirbad(dp, "mangled .. entry", (off_t)0); 1718 err = ENOTDIR; 1719 goto bad; 1720 } 1721 1722 /* 1723 * Increment the link count in the new parent inode and force it out. 1724 */ 1725 if (npdp->i_nlink == MAXLINK) { 1726 err = EMLINK; 1727 goto bad; 1728 } 1729 npdp->i_nlink++; 1730 TRANS_INODE(npdp->i_ufsvfs, npdp); 1731 npdp->i_flag |= ICHG; 1732 npdp->i_seq++; 1733 ufs_iupdat(npdp, I_SYNC); 1734 1735 /* 1736 * Rewrite the child ".." entry and force it out. 1737 */ 1738 dvp = ITOV(dp); 1739 dirp->dotdot_ino = (uint32_t)npdp->i_number; 1740 dnlc_update(dvp, "..", ITOV(npdp)); 1741 (void) dnlc_dir_update(&dp->i_danchor, "..", 1742 INO_OFF_TO_H(dirp->dotdot_ino, 0)); 1743 1744 err = TRANS_DIR(dp, 0); 1745 if (err) 1746 fbrelse(fbp, S_OTHER); 1747 else 1748 err = ufs_fbwrite(fbp, dp); 1749 1750 fbp = NULL; 1751 if (err) 1752 goto bad; 1753 1754 rw_exit(&dp->i_contents); 1755 rw_exit(&dp->i_rwlock); 1756 1757 /* 1758 * Decrement the link count of the old parent inode and force it out. 1759 */ 1760 ASSERT(opdp); 1761 rw_enter(&opdp->i_contents, RW_WRITER); 1762 ASSERT(opdp->i_nlink > 0); 1763 opdp->i_nlink--; 1764 ufs_setreclaim(opdp); 1765 TRANS_INODE(opdp->i_ufsvfs, opdp); 1766 opdp->i_flag |= ICHG; 1767 opdp->i_seq++; 1768 ufs_iupdat(opdp, I_SYNC); 1769 rw_exit(&opdp->i_contents); 1770 return (0); 1771 1772 bad: 1773 if (fbp) 1774 fbrelse(fbp, S_OTHER); 1775 rw_exit(&dp->i_contents); 1776 rw_exit(&dp->i_rwlock); 1777 return (err); 1778 } 1779 1780 /* 1781 * Enter the file sip in the directory tdp with name namep. 
1782 */ 1783 static int 1784 ufs_diraddentry( 1785 struct inode *tdp, 1786 char *namep, 1787 enum de_op op, 1788 int namlen, 1789 struct ufs_slot *slotp, 1790 struct inode *sip, 1791 struct inode *sdp, 1792 struct cred *cr) 1793 { 1794 struct direct *ep, *nep; 1795 vnode_t *tdvp; 1796 dcanchor_t *dcap = &tdp->i_danchor; 1797 off_t offset; 1798 int err; 1799 ushort_t extra; 1800 1801 ASSERT(RW_WRITE_HELD(&tdp->i_rwlock)); 1802 ASSERT(RW_WRITE_HELD(&tdp->i_contents)); 1803 /* 1804 * Prepare a new entry. If the caller has not supplied an 1805 * existing inode, make a new one. 1806 */ 1807 err = dirprepareentry(tdp, slotp, cr); 1808 if (err) { 1809 if (slotp->fbp) { 1810 fbrelse(slotp->fbp, S_OTHER); 1811 slotp->fbp = NULL; 1812 } 1813 return (err); 1814 } 1815 /* 1816 * Check inode to be linked to see if it is in the 1817 * same filesystem. 1818 */ 1819 if (ITOV(tdp)->v_vfsp != ITOV(sip)->v_vfsp) { 1820 err = EXDEV; 1821 goto bad; 1822 } 1823 1824 /* 1825 * If renaming a directory then fix up the ".." entry in the 1826 * directory to point to the new parent. 1827 */ 1828 if ((op == DE_RENAME) && (((sip->i_mode & IFMT) == IFDIR) || 1829 ((sip->i_mode & IFMT) == IFATTRDIR)) && (sdp != tdp)) { 1830 err = ufs_dirfixdotdot(sip, sdp, tdp); 1831 if (err) 1832 goto bad; 1833 } 1834 1835 /* 1836 * Fill in entry data. 1837 */ 1838 ep = slotp->ep; 1839 ep->d_namlen = (ushort_t)namlen; 1840 (void) strncpy(ep->d_name, namep, (size_t)((namlen + 4) & ~3)); 1841 ep->d_ino = (uint32_t)sip->i_number; 1842 tdvp = ITOV(tdp); 1843 dnlc_update(tdvp, namep, ITOV(sip)); 1844 /* 1845 * Note the offset supplied for any named entry is 1846 * the offset of the previous one, unless it's the 1st. 1847 * slotp->size is used to pass the length to 1848 * the previous entry. 
1849 */ 1850 if (slotp->size) { 1851 offset = slotp->offset - slotp->size; 1852 } else { 1853 offset = slotp->offset + 1; 1854 } 1855 1856 if (slotp->cached) { 1857 /* 1858 * Add back any usable unused space to the dnlc directory 1859 * cache. 1860 */ 1861 extra = ep->d_reclen - DIRSIZ(ep); 1862 if (extra >= LDIRSIZ(1)) { 1863 (void) dnlc_dir_add_space(dcap, extra, 1864 (uint64_t)slotp->offset); 1865 } 1866 1867 (void) dnlc_dir_add_entry(dcap, namep, 1868 INO_OFF_TO_H(ep->d_ino, offset)); 1869 1870 /* adjust the previous offset of the next entry */ 1871 nep = (struct direct *)((char *)ep + ep->d_reclen); 1872 if ((uintptr_t)nep & (DIRBLKSIZ - 1)) { 1873 /* 1874 * Not a new block. 1875 * 1876 * Check the validity of the next entry. 1877 * If it's bad, then throw away the cache, and 1878 * continue as before directory caching. 1879 */ 1880 if ((nep->d_reclen == 0) || (nep->d_reclen & 0x3) || 1881 dnlc_dir_update(dcap, nep->d_name, 1882 INO_OFF_TO_H(nep->d_ino, slotp->offset)) 1883 == DNOENT) { 1884 dnlc_dir_purge(dcap); 1885 slotp->cached = 0; 1886 } 1887 } 1888 } 1889 1890 /* 1891 * Write out the directory block. 1892 */ 1893 err = TRANS_DIR(tdp, slotp->offset); 1894 if (err) 1895 fbrelse(slotp->fbp, S_OTHER); 1896 else 1897 err = ufs_fbwrite(slotp->fbp, tdp); 1898 1899 slotp->fbp = NULL; 1900 /* 1901 * If this is a rename of a directory, then we have already 1902 * fixed the ".." entry to refer to the new parent. If err 1903 * is true at this point, we have failed to update the new 1904 * parent to refer to the renamed directory. 1905 * XXX - we need to unwind the ".." fix. 1906 */ 1907 if (err) 1908 return (err); 1909 1910 /* 1911 * Mark the directory inode to reflect the changes. 1912 * Truncate the directory to chop off blocks of empty entries. 
1913 */ 1914 1915 TRANS_INODE(tdp->i_ufsvfs, tdp); 1916 tdp->i_flag |= IUPD|ICHG; 1917 tdp->i_seq++; 1918 tdp->i_diroff = 0; 1919 ITIMES_NOLOCK(tdp); 1920 /* 1921 * If the directory grew then dirprepareentry() will have 1922 * set IATTCHG in tdp->i_flag, then the directory inode must 1923 * be flushed out. This is because if fsync() is used later 1924 * the directory size must be correct, otherwise a crash would 1925 * cause fsck to move the file to lost+found. Also because later 1926 * a file may be linked in more than one directory, then there 1927 * is no way to flush the original directory. So it must be 1928 * flushed out on creation. See bug 4293809. 1929 */ 1930 if (tdp->i_flag & IATTCHG) { 1931 ufs_iupdat(tdp, I_SYNC); 1932 } 1933 1934 if (slotp->endoff && (slotp->endoff < tdp->i_size)) { 1935 if (!TRANS_ISTRANS(tdp->i_ufsvfs)) { 1936 (void) ufs_itrunc(tdp, (u_offset_t)slotp->endoff, 0, 1937 cr); 1938 } 1939 } 1940 1941 1942 return (0); 1943 1944 bad: 1945 if (slotp->cached) { 1946 dnlc_dir_purge(dcap); 1947 fbrelse(slotp->fbp, S_OTHER); 1948 slotp->cached = 0; 1949 slotp->fbp = NULL; 1950 return (err); 1951 } 1952 1953 /* 1954 * Clear out entry prepared by dirprepareent. 1955 */ 1956 slotp->ep->d_ino = 0; 1957 slotp->ep->d_namlen = 0; 1958 1959 /* 1960 * Don't touch err so we don't clobber the real error that got us here. 1961 */ 1962 if (TRANS_DIR(tdp, slotp->offset)) 1963 fbrelse(slotp->fbp, S_OTHER); 1964 else 1965 (void) ufs_fbwrite(slotp->fbp, tdp); 1966 slotp->fbp = NULL; 1967 return (err); 1968 } 1969 1970 /* 1971 * Prepare a directory slot to receive an entry. 
 */
/*
 * dirprepareentry() details:
 *
 * slotp->status must be NONE (append a fresh DIRBLKSIZ block at
 * slotp->offset) or FOUND (reuse the space at slotp->offset).  dp must be
 * held with i_rwlock and i_contents as writer.
 *
 * On success slotp->ep points at the record to fill in, slotp->fbp holds
 * its block, slotp->offset is the offset of that record and slotp->size is
 * the length of the entry preceding it in the block (0 if it is the first).
 * Returns 0 or the error from ufs_fault()/BMAPALLOC()/blkatoff(); on error
 * a block already attached to slotp->fbp is left for the caller to release.
 */
static int
dirprepareentry(
	struct inode *dp,	/* directory we are working in */
	struct ufs_slot *slotp,	/* available slot info */
	struct cred *cr)
{
	struct direct *ep, *nep;
	off_t entryend;
	int err;
	slotstat_t status = slotp->status;
	ushort_t dsize;

	ASSERT((status == NONE) || (status == FOUND));
	ASSERT(RW_WRITE_HELD(&dp->i_rwlock));
	ASSERT(RW_WRITE_HELD(&dp->i_contents));
	/*
	 * If we didn't find a slot, then indicate that the
	 * new slot belongs at the end of the directory.
	 * If we found a slot, then the new entry can be
	 * put at slotp->offset.
	 */
	entryend = slotp->offset + slotp->size;
	if (status == NONE) {
		ASSERT((slotp->offset & (DIRBLKSIZ - 1)) == 0);
		if (DIRBLKSIZ > dp->i_fs->fs_fsize) {
			err = ufs_fault(ITOV(dp),
			    "dirprepareentry: bad fs_fsize, DIRBLKSIZ: %d"
			    " > dp->i_fs->fs_fsize: %d (%s)",
			    DIRBLKSIZ, dp->i_fs->fs_fsize, dp->i_fs->fs_fsmnt);
			return (err);
		}
		/*
		 * Allocate the new block.
		 */
		err = BMAPALLOC(dp, (u_offset_t)slotp->offset,
		    (int)(blkoff(dp->i_fs, slotp->offset) + DIRBLKSIZ), cr);
		if (err) {
			return (err);
		}
		/*
		 * The directory grew; IATTCHG is what makes
		 * the caller push the inode synchronously.
		 */
		dp->i_size = entryend;
		TRANS_INODE(dp->i_ufsvfs, dp);
		dp->i_flag |= IUPD|ICHG|IATTCHG;
		dp->i_seq++;
		ITIMES_NOLOCK(dp);
	} else if (entryend > dp->i_size) {
		/*
		 * Adjust directory size, if needed. This should never
		 * push the size past a new multiple of DIRBLKSIZ.
		 * This is an artifact of the old (4.2BSD) way of initializing
		 * directory sizes to be less than DIRBLKSIZ.
		 */
		dp->i_size = P2ROUNDUP_TYPED(entryend, DIRBLKSIZ, off_t);
		TRANS_INODE(dp->i_ufsvfs, dp);
		dp->i_flag |= IUPD|ICHG|IATTCHG;
		dp->i_seq++;
		ITIMES_NOLOCK(dp);
	}

	/*
	 * Get the block containing the space for the new directory entry.
	 */
	if (slotp->fbp == NULL) {
		err = blkatoff(dp, slotp->offset, (char **)&slotp->ep,
		    &slotp->fbp);
		if (err) {
			return (err);
		}
	}
	ep = slotp->ep;

	switch (status) {
	case NONE:
		/*
		 * No space in the directory. slotp->offset will be on a
		 * directory block boundary and we will write the new entry
		 * into a fresh block.
		 */
		ep->d_reclen = DIRBLKSIZ;
		slotp->size = 0;	/* length of previous entry */
		break;
	case FOUND:
		/*
		 * An entry of the required size has been found. Use it.
		 */
		if (ep->d_ino == 0) {
			/* this is the 1st record in a block */
			slotp->size = 0;	/* length of previous entry */
		} else {
			/*
			 * The record is in use: shrink it to what its name
			 * needs and hand the remainder to a new record
			 * that starts right after it.
			 */
			dsize = DIRSIZ(ep);
			nep = (struct direct *)((char *)ep + dsize);
			nep->d_reclen = ep->d_reclen - dsize;
			ep->d_reclen = dsize;
			slotp->ep = nep;
			slotp->offset += dsize;
			slotp->size = dsize;	/* length of previous entry */
		}
		break;
	default:
		break;
	}
	return (0);
}

/*
 * Allocate and initialize a new inode that will go into directory tdp.
 * This routine is called from ufs_symlink(), as well as within this file.
 *
 * vap must carry at least AT_TYPE and AT_MODE; op must be DE_CREATE,
 * DE_MKDIR, DE_ATTRDIR or DE_SYMLINK (asserted).  tdp must be held with
 * i_rwlock and i_contents as writer, and vfs_dqrwlock is expected to be
 * held as reader by the caller.
 *
 * On success returns 0 with *ipp set to the new inode, which has been
 * pushed to disk.  On failure an errno value is returned.  If the failure
 * happens before the inode is allocated, or on the i_dquot fault path,
 * *ipp is NULL; on the later "fail" exits *ipp still points at the
 * discarded inode (link count zeroed), so the caller has to deal with it.
 */
int
ufs_dirmakeinode(
	struct inode *tdp,
	struct inode **ipp,
	struct vattr *vap,
	enum de_op op,
	struct cred *cr)
{
	struct inode *ip;
	enum vtype type;
	int imode;			/* mode and format as in inode */
	ino_t ipref;
	int err;
	timestruc_t now;

	ASSERT(vap != NULL);
	ASSERT(op == DE_CREATE || op == DE_MKDIR || op == DE_ATTRDIR ||
	    op == DE_SYMLINK);
	ASSERT((vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));
	ASSERT(RW_WRITE_HELD(&tdp->i_rwlock));
	ASSERT(RW_WRITE_HELD(&tdp->i_contents));
	/*
	 * Allocate a new inode.
	 */
	type = vap->va_type;
	if (type == VDIR) {
		ipref = dirpref(tdp);
	} else {
		ipref = tdp->i_number;
	}
	/* For DE_ATTRDIR the caller's va_mode is used as the inode mode as is */
	if (op == DE_ATTRDIR)
		imode = vap->va_mode;
	else
		imode = MAKEIMODE(type, vap->va_mode);
	*ipp = NULL;
	err = ufs_ialloc(tdp, ipref, imode, &ip, cr);
	if (err)
		return (err);

	/*
	 * We don't need to grab vfs_dqrwlock here because it is held
	 * in ufs_direnter_*() above us.
	 */
	ASSERT(RW_READ_HELD(&ip->i_ufsvfs->vfs_dqrwlock));
	rw_enter(&ip->i_contents, RW_WRITER);
	if (ip->i_dquot != NULL) {
		err = ufs_fault(ITOV(ip),
		    "ufs_dirmakeinode, ip->i_dquot != NULL: dquot (%s)",
		    tdp->i_fs->fs_fsmnt);
		rw_exit(&ip->i_contents);
		return (err);
	}
	*ipp = ip;
	ip->i_mode = (o_mode_t)imode;
	if (type == VBLK || type == VCHR) {
		dev_t d = vap->va_rdev;
		dev32_t dev32;

		/*
		 * Don't allow a special file to be created with a
		 * dev_t that cannot be represented by this filesystem
		 * format on disk.
		 */
		if (!cmpldev(&dev32, d)) {
			err = EOVERFLOW;
			goto fail;
		}

		ITOV(ip)->v_rdev = ip->i_rdev = d;

		if (dev32 & ~((O_MAXMAJ << L_BITSMINOR32) | O_MAXMIN)) {
			ip->i_ordev = dev32; /* can't use old format */
		} else {
			ip->i_ordev = cmpdev(d);
		}
	}
	ITOV(ip)->v_type = type;
	ufs_reset_vnode(ip->i_vnode);
	if (type == VDIR) {
		ip->i_nlink = 2; /* anticipating a call to dirmakedirect */
	} else {
		ip->i_nlink = 1;
	}

	if (op == DE_ATTRDIR) {
		ip->i_uid = vap->va_uid;
		ip->i_gid = vap->va_gid;
	} else
		ip->i_uid = crgetuid(cr);
	/*
	 * To determine the group-id of the created file:
	 *   1) If the gid is set in the attribute list (non-Sun & pre-4.0
	 *	clients are not likely to set the gid), then use it if
	 *	the process is privileged, belongs to the target group,
	 *	or the group is the same as the parent directory.
	 *   2) If the filesystem was not mounted with the Old-BSD-compatible
	 *	GRPID option, and the directory's set-gid bit is clear,
	 *	then use the process's gid.
	 *   3) Otherwise, set the group-id to the gid of the parent directory.
	 *
	 * NOTE(review): for DE_ATTRDIR the condition below is always false,
	 * so the else branch overwrites the i_gid = va_gid assignment made
	 * just above -- confirm that this is intended.
	 */
	if (op != DE_ATTRDIR && (vap->va_mask & AT_GID) &&
	    ((vap->va_gid == tdp->i_gid) || groupmember(vap->va_gid, cr) ||
	    secpolicy_vnode_create_gid(cr) == 0)) {
		/*
		 * XXX - is this only the case when a 4.0 NFS client, or a
		 * client derived from that code, makes a call over the wire?
		 */
		ip->i_gid = vap->va_gid;
	} else
		ip->i_gid = (tdp->i_mode & ISGID) ? tdp->i_gid : crgetgid(cr);

	/*
	 * For SunOS 5.0->5.4, the lines below read:
	 *
	 * ip->i_suid = (ip->i_uid > MAXUID) ? UID_LONG : ip->i_uid;
	 * ip->i_sgid = (ip->i_gid > MAXUID) ? GID_LONG : ip->i_gid;
	 *
	 * where MAXUID was set to 60002.  See notes on this in ufs_inode.c
	 */
	ip->i_suid = (ulong_t)ip->i_uid > (ulong_t)USHRT_MAX ?
	    UID_LONG : ip->i_uid;
	ip->i_sgid = (ulong_t)ip->i_gid > (ulong_t)USHRT_MAX ?
	    GID_LONG : ip->i_gid;

	/*
	 * If we're creating a directory, and the parent directory has the
	 * set-GID bit set, set it on the new directory.
	 * Otherwise, if the user is neither privileged nor a member of the
	 * file's new group, clear the file's set-GID bit.
	 */
	if ((tdp->i_mode & ISGID) && (type == VDIR))
		ip->i_mode |= ISGID;
	else {
		if ((ip->i_mode & ISGID) &&
		    secpolicy_vnode_setids_setgids(cr, ip->i_gid) != 0)
			ip->i_mode &= ~ISGID;
	}

	/*
	 * Reject caller-supplied times that TIMESPEC_OVERFLOW() flags.
	 */
	if (((vap->va_mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) ||
	    ((vap->va_mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) {
		err = EOVERFLOW;
		goto fail;
	}

	/*
	 * Extended attribute directories are not subject to quotas.
	 */
	if (op != DE_ATTRDIR)
		ip->i_dquot = getinoquota(ip);
	else
		ip->i_dquot = NULL;

	/*
	 * New directories get their "." and ".." written now.  The third
	 * argument is the attrdir flag: 0 for DE_MKDIR, 1 for DE_ATTRDIR.
	 */
	if (op == DE_MKDIR || op == DE_ATTRDIR) {
		err = ufs_dirmakedirect(ip, tdp, (op == DE_MKDIR) ? 0 : 1, cr);
		if (err)
			goto fail;
	}

	/*
	 * generate the shadow inode and attach it to the new object
	 */
	ASSERT((tdp->i_shadow && tdp->i_ufs_acl) ||
	    (!tdp->i_shadow && !tdp->i_ufs_acl));
	if (tdp->i_shadow && tdp->i_ufs_acl &&
	    (((tdp->i_mode & IFMT) == IFDIR) ||
	    ((tdp->i_mode & IFMT) == IFATTRDIR))) {
		err = ufs_si_inherit(ip, tdp, ip->i_mode, cr);
		if (err) {
			if (op == DE_MKDIR) {
				/*
				 * clean up parent directory
				 *
				 * tdp->i_contents already locked from
				 * ufs_direnter_*()
				 *
				 * This undoes the link count bump done by
				 * ufs_dirmakedirect() for the new "..".
				 */
				tdp->i_nlink--;
				TRANS_INODE(tdp->i_ufsvfs, tdp);
				tdp->i_flag |= ICHG;
				tdp->i_seq++;
				ufs_iupdat(tdp, I_SYNC);
			}
			goto fail;
		}
	}

	/*
	 * If the passed in attributes contain atime and/or mtime
	 * settings, then use them instead of using the current
	 * high resolution time.
	 */
	if (vap->va_mask & (AT_MTIME|AT_ATIME)) {
		if (vap->va_mask & AT_ATIME) {
			ip->i_atime.tv_sec = vap->va_atime.tv_sec;
			ip->i_atime.tv_usec = vap->va_atime.tv_nsec / 1000;
			ip->i_flag &= ~IACC;
		} else
			ip->i_flag |= IACC;
		if (vap->va_mask & AT_MTIME) {
			ip->i_mtime.tv_sec = vap->va_mtime.tv_sec;
			ip->i_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;
			/* ctime is still taken from the current time */
			gethrestime(&now);
			if (now.tv_sec > TIME32_MAX) {
				/*
				 * In 2038, ctime sticks forever..
				 */
				ip->i_ctime.tv_sec = TIME32_MAX;
				ip->i_ctime.tv_usec = 0;
			} else {
				ip->i_ctime.tv_sec = now.tv_sec;
				ip->i_ctime.tv_usec = now.tv_nsec / 1000;
			}
			ip->i_flag &= ~(IUPD|ICHG);
			ip->i_flag |= IMODTIME;
		} else
			ip->i_flag |= IUPD|ICHG;
		ip->i_flag |= IMOD;
	} else
		ip->i_flag |= IACC|IUPD|ICHG;
	ip->i_seq++;

	/*
	 * If this is an attribute tag it as one.
	 */
	if ((tdp->i_mode & IFMT) == IFATTRDIR) {
		ip->i_cflags |= IXATTR;
	}

	/*
	 * push inode before its name appears in a directory
	 */
	TRANS_INODE(ip->i_ufsvfs, ip);
	ufs_iupdat(ip, I_SYNC);
	rw_exit(&ip->i_contents);
	return (0);

fail:
	/* Throw away inode we just allocated. */
	ip->i_nlink = 0;
	ufs_setreclaim(ip);
	TRANS_INODE(ip->i_ufsvfs, ip);
	ip->i_flag |= ICHG;
	ip->i_seq++;
	ITIMES_NOLOCK(ip);
	rw_exit(&ip->i_contents);
	return (err);
}

/*
 * Write a prototype directory into the empty inode ip, whose parent is dp.
 *
 * ip->i_contents, dp->i_rwlock and dp->i_contents must be held as writer
 * (asserted).  When attrdir is 0 the parent's link count is bumped for the
 * new ".." entry (and dropped again if writing the block fails); when it
 * is non-zero the parent's link count is left alone.
 *
 * Returns 0, EMLINK, or the error from BMAPALLOC()/ufs_fault()/fbread()/
 * TRANS_DIR()/ufs_fbwrite().
 */
static int
ufs_dirmakedirect(
	struct inode *ip,		/* new directory */
	struct inode *dp,		/* parent directory */
	int attrdir,
	struct cred *cr)
{
	struct dirtemplate *dirp;
	struct fbuf *fbp;
	int err;

	ASSERT(RW_WRITE_HELD(&ip->i_contents));
	ASSERT(RW_WRITE_HELD(&dp->i_rwlock));
	ASSERT(RW_WRITE_HELD(&dp->i_contents));
	/*
	 * Allocate space for the directory we're creating.
	 */
	err = BMAPALLOC(ip, (u_offset_t)0, DIRBLKSIZ, cr);
	if (err)
		return (err);
	if (DIRBLKSIZ > dp->i_fs->fs_fsize) {
		err = ufs_fault(ITOV(dp),
"ufs_dirmakedirect: bad fs_fsize, DIRBLKSIZ: %d > dp->i_fs->fs_fsize: %d (%s)",
		    DIRBLKSIZ, dp->i_fs->fs_fsize,
		    dp->i_fs->fs_fsmnt);
		return (err);
	}
	ip->i_size = DIRBLKSIZ;
	TRANS_INODE(ip->i_ufsvfs, ip);
	ip->i_flag |= IUPD|ICHG|IATTCHG;
	ip->i_seq++;
	ITIMES_NOLOCK(ip);
	/*
	 * Update the parent (dp) link count and write out the change.
	 * This reflects the ".." entry we'll soon write.
	 *
	 * NOTE(review): the MAXLINK check is applied even when attrdir is
	 * non-zero, although the link count is only incremented when
	 * attrdir == 0 -- confirm that EMLINK is wanted in that case.
	 */
	if (dp->i_nlink == MAXLINK)
		return (EMLINK);
	if (attrdir == 0)
		dp->i_nlink++;
	TRANS_INODE(dp->i_ufsvfs, dp);
	dp->i_flag |= ICHG;
	dp->i_seq++;
	ufs_iupdat(dp, I_SYNC);
	/*
	 * Initialize directory with "."
	 * and ".." from static template.
	 *
	 * Since the parent directory is locked, we don't have to
	 * worry about anything changing when we drop the write
	 * lock on (ip).
	 *
	 */
	err = fbread(ITOV(ip), (offset_t)0, (uint_t)ip->i_fs->fs_fsize,
	    S_READ, &fbp);

	if (err) {
		goto fail;
	}
	dirp = (struct dirtemplate *)fbp->fb_addr;
	/*
	 * Now initialize the directory we're creating
	 * with the "." and ".." entries.
	 */
	*dirp = mastertemplate;			/* structure assignment */
	dirp->dot_ino = (uint32_t)ip->i_number;
	dirp->dotdot_ino = (uint32_t)dp->i_number;

	err = TRANS_DIR(ip, 0);
	if (err) {
		fbrelse(fbp, S_OTHER);
		goto fail;
	}

	err = ufs_fbwrite(fbp, ip);
	if (err) {
		goto fail;
	}

	return (0);

fail:
	/* Undo the link count bump made above for the ".." entry. */
	if (attrdir == 0)
		dp->i_nlink--;
	TRANS_INODE(dp->i_ufsvfs, dp);
	dp->i_flag |= ICHG;
	dp->i_seq++;
	ufs_iupdat(dp, I_SYNC);
	return (err);
}

/*
 * Delete a directory entry.  If oip is nonzero the entry is checked
 * to make sure it still reflects oip.
 *
 * If vpp is non-null, return the ptr of the (held) vnode associated with
 * the removed name.  The caller is responsible for doing the VN_RELE().
 */
int
ufs_dirremove(
	struct inode *dp,
	char *namep,
	struct inode *oip,
	struct vnode *cdir,
	enum dr_op op,
	struct cred *cr,
	vnode_t **vpp)	/* Return (held) vnode ptr of removed file/dir */
{
	struct direct *ep, *pep, *nep;
	struct inode *ip;
	vnode_t *dvp, *vp;
	struct ufs_slot slot;
	int namlen;
	int err;
	int mode;
	ushort_t extra;

	namlen = (int)strlen(namep);
	if (namlen == 0)
		return (ufs_fault(ITOV(dp), "ufs_dirremove: namlen == 0"));
	/*
	 * return error when removing . and ..
2452 */ 2453 if (namep[0] == '.') { 2454 if (namlen == 1) 2455 return (EINVAL); 2456 else if (namlen == 2 && namep[1] == '.') { 2457 return (EEXIST); /* SIGH should be ENOTEMPTY */ 2458 } 2459 } 2460 2461 ASSERT(RW_WRITE_HELD(&dp->i_rwlock)); 2462 /* 2463 * Check accessibility of directory. 2464 */ 2465 retry: 2466 if (((dp->i_mode & IFMT) != IFDIR) && 2467 ((dp->i_mode & IFMT) != IFATTRDIR)) { 2468 return (ENOTDIR); 2469 } 2470 2471 /* 2472 * Execute access is required to search the directory. 2473 * Access for write is interpreted as allowing 2474 * deletion of files in the directory. 2475 */ 2476 if (err = ufs_iaccess(dp, IEXEC|IWRITE, cr)) { 2477 return (err); 2478 } 2479 2480 ip = NULL; 2481 slot.fbp = NULL; 2482 slot.status = FOUND; /* don't need to look for empty slot */ 2483 rw_enter(&dp->i_ufsvfs->vfs_dqrwlock, RW_READER); 2484 rw_enter(&dp->i_contents, RW_WRITER); 2485 err = ufs_dircheckforname(dp, namep, namlen, &slot, &ip, cr, 0); 2486 if (err) 2487 goto out_novfs; 2488 if (ip == NULL) { 2489 err = ENOENT; 2490 goto out_novfs; 2491 } 2492 vp = ITOV(ip); 2493 if (oip && oip != ip) { 2494 err = ENOENT; 2495 goto out_novfs; 2496 } 2497 2498 mode = ip->i_mode & IFMT; 2499 if (mode == IFDIR || mode == IFATTRDIR) { 2500 2501 /* 2502 * vn_vfsrlock() prevents races between mount and rmdir. 2503 */ 2504 if (vn_vfsrlock(vp)) { 2505 err = EBUSY; 2506 goto out_novfs; 2507 } 2508 if (vn_mountedvfs(vp) != NULL && op != DR_RENAME) { 2509 err = EBUSY; 2510 goto out; 2511 } 2512 /* 2513 * If we are removing a directory, get a lock on it. 2514 * Taking a writer lock prevents a parallel ufs_dirlook from 2515 * incorrectly entering a negative cache vnode entry in the dnlc 2516 * If the directory is empty, it will stay empty until 2517 * we can remove it. 2518 */ 2519 if (!rw_tryenter(&ip->i_rwlock, RW_WRITER)) { 2520 /* 2521 * It is possible that a thread in rename would have 2522 * acquired this rwlock. To prevent a deadlock we 2523 * do a rw_tryenter. 
If we fail to get the lock 2524 * we drop all the locks we have acquired, wait 2525 * for 2 ticks and reacquire the 2526 * directory's (dp) i_rwlock and try again. 2527 * If we dont drop dp's i_rwlock then we will panic 2528 * with a "Deadlock: cycle in blocking chain" 2529 * since in ufs_dircheckpath we want dp's i_rwlock. 2530 * dp is guaranteed to exist since ufs_dirremove is 2531 * called after a VN_HOLD(dp) has been done. 2532 */ 2533 ufs_dirremove_retry_cnt++; 2534 vn_vfsunlock(vp); 2535 if (slot.fbp) 2536 fbrelse(slot.fbp, S_OTHER); 2537 rw_exit(&dp->i_contents); 2538 rw_exit(&dp->i_ufsvfs->vfs_dqrwlock); 2539 rw_exit(&dp->i_rwlock); 2540 VN_RELE(vp); 2541 delay(2); 2542 rw_enter(&dp->i_rwlock, RW_WRITER); 2543 goto retry; 2544 } 2545 } 2546 rw_enter(&ip->i_contents, RW_READER); 2547 2548 /* 2549 * Now check the restrictions that apply on sticky directories. 2550 */ 2551 if ((err = ufs_sticky_remove_access(dp, ip, cr)) != 0) { 2552 rw_exit(&ip->i_contents); 2553 if (mode == IFDIR || mode == IFATTRDIR) 2554 rw_exit(&ip->i_rwlock); 2555 goto out; 2556 } 2557 2558 if (op == DR_RMDIR) { 2559 /* 2560 * For rmdir(2), some special checks are required. 2561 * (a) Don't remove any alias of the parent (e.g. "."). 2562 * (b) Don't remove the current directory. 2563 * (c) Make sure the entry is (still) a directory. 2564 * (d) Make sure the directory is empty. 2565 */ 2566 2567 if (dp == ip || vp == cdir) 2568 err = EINVAL; 2569 else if (((ip->i_mode & IFMT) != IFDIR) && 2570 ((ip->i_mode & IFMT) != IFATTRDIR)) 2571 err = ENOTDIR; 2572 else if ((ip->i_nlink > 2) || 2573 !ufs_dirempty(ip, dp->i_number, cr)) { 2574 err = EEXIST; /* SIGH should be ENOTEMPTY */ 2575 } 2576 2577 if (err) { 2578 rw_exit(&ip->i_contents); 2579 if (mode == IFDIR || mode == IFATTRDIR) 2580 rw_exit(&ip->i_rwlock); 2581 goto out; 2582 } 2583 } else if (op == DR_REMOVE) { 2584 /* 2585 * unlink(2) requires a different check: allow only 2586 * privileged users to unlink a directory. 
2587 */ 2588 if (vp->v_type == VDIR && 2589 secpolicy_fs_linkdir(cr, vp->v_vfsp)) { 2590 err = EPERM; 2591 rw_exit(&ip->i_contents); 2592 rw_exit(&ip->i_rwlock); 2593 goto out; 2594 } 2595 } 2596 2597 rw_exit(&ip->i_contents); 2598 2599 /* 2600 * Remove the cache'd entry, if any. 2601 */ 2602 dvp = ITOV(dp); 2603 dnlc_remove(dvp, namep); 2604 ep = slot.ep; 2605 ep->d_ino = 0; 2606 2607 if (slot.cached) { 2608 dcanchor_t *dcap = &dp->i_danchor; 2609 2610 (void) dnlc_dir_rem_entry(dcap, namep, NULL); 2611 if (((int)ep->d_reclen - (int)DIRSIZ(ep)) >= LDIRSIZ(1)) { 2612 (void) dnlc_dir_rem_space_by_handle(dcap, slot.offset); 2613 } 2614 if (slot.offset & (DIRBLKSIZ - 1)) { 2615 /* 2616 * Collapse new free space into previous entry. 2617 * Note, the previous entry has already been 2618 * validated in ufs_dircheckforname(). 2619 */ 2620 ASSERT(slot.size); 2621 pep = (struct direct *)((char *)ep - slot.size); 2622 if ((pep->d_ino == 0) && 2623 ((uintptr_t)pep & (DIRBLKSIZ - 1))) { 2624 dnlc_dir_purge(dcap); 2625 slot.cached = 0; 2626 goto nocache; 2627 } 2628 if (pep->d_ino) { 2629 extra = pep->d_reclen - DIRSIZ(pep); 2630 } else { 2631 extra = pep->d_reclen; 2632 } 2633 if (extra >= LDIRSIZ(1)) { 2634 (void) dnlc_dir_rem_space_by_handle(dcap, 2635 (uint64_t)(slot.offset - slot.size)); 2636 } 2637 pep->d_reclen += ep->d_reclen; 2638 (void) dnlc_dir_add_space(dcap, extra + ep->d_reclen, 2639 (uint64_t)(slot.offset - slot.size)); 2640 /* adjust the previous pointer in the next entry */ 2641 nep = (struct direct *)((char *)ep + ep->d_reclen); 2642 if ((uintptr_t)nep & (DIRBLKSIZ - 1)) { 2643 /* 2644 * Not a new block. 2645 * 2646 * Check the validity of the entry. 2647 * If it's bad, then throw away the cache and 2648 * continue. 
2649 */ 2650 if ((nep->d_reclen == 0) || 2651 (nep->d_reclen & 0x3) || 2652 (dnlc_dir_update(dcap, nep->d_name, 2653 INO_OFF_TO_H(nep->d_ino, 2654 slot.offset - slot.size)) == DNOENT)) { 2655 dnlc_dir_purge(dcap); 2656 slot.cached = 0; 2657 } 2658 } 2659 } else { 2660 (void) dnlc_dir_add_space(dcap, ep->d_reclen, 2661 (uint64_t)slot.offset); 2662 } 2663 } else { 2664 /* 2665 * If the entry isn't the first in the directory, we must 2666 * reclaim the space of the now empty record by adding 2667 * the record size to the size of the previous entry. 2668 */ 2669 if (slot.offset & (DIRBLKSIZ - 1)) { 2670 /* 2671 * Collapse new free space into previous entry. 2672 */ 2673 pep = (struct direct *)((char *)ep - slot.size); 2674 pep->d_reclen += ep->d_reclen; 2675 } 2676 } 2677 nocache: 2678 2679 2680 err = TRANS_DIR(dp, slot.offset); 2681 if (err) 2682 fbrelse(slot.fbp, S_OTHER); 2683 else 2684 err = ufs_fbwrite(slot.fbp, dp); 2685 slot.fbp = NULL; 2686 2687 /* 2688 * If we were removing a directory, it is 'gone' now, but we cannot 2689 * unlock it as a thread may be waiting for the lock in ufs_create. If 2690 * we did, it could then create a file in a deleted directory. 2691 */ 2692 2693 if (err) { 2694 if (mode == IFDIR || mode == IFATTRDIR) 2695 rw_exit(&ip->i_rwlock); 2696 goto out; 2697 } 2698 2699 rw_enter(&ip->i_contents, RW_WRITER); 2700 2701 dp->i_flag |= IUPD|ICHG; 2702 dp->i_seq++; 2703 ip->i_flag |= ICHG; 2704 ip->i_seq++; 2705 2706 TRANS_INODE(dp->i_ufsvfs, dp); 2707 TRANS_INODE(ip->i_ufsvfs, ip); 2708 /* 2709 * Now dispose of the inode. 2710 */ 2711 if (ip->i_nlink > 0) { 2712 /* 2713 * This is not done for IFATTRDIR's because they don't 2714 * have entries in the dnlc and the link counts are 2715 * not incremented when they are created. 2716 */ 2717 if (op == DR_RMDIR && (ip->i_mode & IFMT) == IFDIR) { 2718 /* 2719 * Decrement by 2 because we're trashing the "." 2720 * entry as well as removing the entry in dp. 
2721 * Clear the directory entry, but there may be 2722 * other hard links so don't free the inode. 2723 * Decrement the dp linkcount because we're 2724 * trashing the ".." entry. 2725 */ 2726 ip->i_nlink -= 2; 2727 dp->i_nlink--; 2728 ufs_setreclaim(dp); 2729 /* 2730 * XXX need to discard negative cache entries 2731 * for vp. See comment in ufs_delete(). 2732 */ 2733 dnlc_remove(vp, "."); 2734 dnlc_remove(vp, ".."); 2735 /* 2736 * The return value is ignored here bacause if 2737 * the directory purge fails we don't want to 2738 * stop the delete. If ufs_dirpurgedotdot fails 2739 * the delete will continue with the preexiting 2740 * behavior. 2741 */ 2742 (void) ufs_dirpurgedotdot(ip, dp->i_number, cr); 2743 } else { 2744 ip->i_nlink--; 2745 } 2746 ufs_setreclaim(ip); 2747 } 2748 ITIMES_NOLOCK(dp); 2749 ITIMES_NOLOCK(ip); 2750 2751 if (!TRANS_ISTRANS(dp->i_ufsvfs)) 2752 ufs_iupdat(dp, I_SYNC); 2753 if (!TRANS_ISTRANS(ip->i_ufsvfs)) 2754 ufs_iupdat(ip, I_SYNC); 2755 2756 rw_exit(&ip->i_contents); 2757 if (mode == IFDIR || mode == IFATTRDIR) 2758 rw_exit(&ip->i_rwlock); 2759 out: 2760 if (mode == IFDIR || mode == IFATTRDIR) { 2761 vn_vfsunlock(vp); 2762 } 2763 out_novfs: 2764 ASSERT(RW_WRITE_HELD(&dp->i_contents)); 2765 2766 if (slot.fbp) 2767 fbrelse(slot.fbp, S_OTHER); 2768 2769 rw_exit(&dp->i_contents); 2770 rw_exit(&dp->i_ufsvfs->vfs_dqrwlock); 2771 2772 /* 2773 * If no error and vpp is non-NULL, return the vnode ptr to the caller. 2774 * The caller becomes responsible for the VN_RELE(). Otherwise, 2775 * Release (and delete) the inode after we drop vfs_dqrwlock to 2776 * avoid deadlock since ufs_delete() grabs vfs_dqrwlock as reader. 2777 */ 2778 if (ip) { 2779 if ((err == 0) && (vpp != NULL)) { 2780 *vpp = ITOV(ip); 2781 } else { 2782 VN_RELE(vp); 2783 } 2784 } 2785 2786 return (err); 2787 } 2788 2789 /* 2790 * Return buffer with contents of block "offset" 2791 * from the beginning of directory "ip". 
If "res" 2792 * is non-zero, fill it in with a pointer to the 2793 * remaining space in the directory. 2794 * 2795 */ 2796 2797 int 2798 blkatoff( 2799 struct inode *ip, 2800 off_t offset, 2801 char **res, 2802 struct fbuf **fbpp) 2803 { 2804 struct fs *fs; 2805 struct fbuf *fbp; 2806 daddr_t lbn; 2807 uint_t bsize; 2808 int err; 2809 2810 CPU_STATS_ADD_K(sys, ufsdirblk, 1); 2811 fs = ip->i_fs; 2812 lbn = (daddr_t)lblkno(fs, offset); 2813 bsize = (uint_t)blksize(fs, ip, lbn); 2814 err = fbread(ITOV(ip), (offset_t)(offset & fs->fs_bmask), 2815 bsize, S_READ, &fbp); 2816 if (err) { 2817 *fbpp = (struct fbuf *)NULL; 2818 return (err); 2819 } 2820 if (res) 2821 *res = fbp->fb_addr + blkoff(fs, offset); 2822 *fbpp = fbp; 2823 return (0); 2824 } 2825 2826 /* 2827 * Do consistency checking: 2828 * record length must be multiple of 4 2829 * entry must fit in rest of its DIRBLKSIZ block 2830 * record must be large enough to contain entry 2831 * name is not longer than MAXNAMLEN 2832 * name must be as long as advertised, and null terminated 2833 * NOTE: record length must not be zero (should be checked previously). 2834 * This routine is only called if dirchk is true. 2835 * It would be nice to set the FSBAD flag in the super-block when 2836 * this routine fails so that a fsck is forced on next reboot, 2837 * but locking is a problem. 
2838 */ 2839 static int 2840 dirmangled( 2841 struct inode *dp, 2842 struct direct *ep, 2843 int entryoffsetinblock, 2844 off_t offset) 2845 { 2846 int i; 2847 2848 i = DIRBLKSIZ - (entryoffsetinblock & (DIRBLKSIZ - 1)); 2849 if ((ep->d_reclen & 0x3) != 0 || (int)ep->d_reclen > i || 2850 (uint_t)ep->d_reclen < DIRSIZ(ep) || ep->d_namlen > MAXNAMLEN || 2851 ep->d_ino && dirbadname(ep->d_name, (int)ep->d_namlen)) { 2852 dirbad(dp, "mangled entry", offset); 2853 return (1); 2854 } 2855 return (0); 2856 } 2857 2858 static void 2859 dirbad(struct inode *ip, char *how, off_t offset) 2860 { 2861 cmn_err(CE_NOTE, "%s: bad dir ino %d at offset %ld: %s", 2862 ip->i_fs->fs_fsmnt, (int)ip->i_number, offset, how); 2863 } 2864 2865 static int 2866 dirbadname(char *sp, int l) 2867 { 2868 while (l--) { /* check for nulls */ 2869 if (*sp++ == '\0') { 2870 return (1); 2871 } 2872 } 2873 return (*sp); /* check for terminating null */ 2874 } 2875 2876 /* 2877 * Check if a directory is empty or not. 2878 */ 2879 static int 2880 ufs_dirempty( 2881 struct inode *ip, 2882 ino_t parentino, 2883 struct cred *cr) 2884 { 2885 return (ufs_dirscan(ip, parentino, cr, 0)); 2886 } 2887 2888 /* 2889 * clear the .. directory entry. 2890 */ 2891 static int 2892 ufs_dirpurgedotdot( 2893 struct inode *ip, 2894 ino_t parentino, 2895 struct cred *cr) 2896 { 2897 return (ufs_dirscan(ip, parentino, cr, 1)); 2898 } 2899 2900 /* 2901 * Scan the directoy. If clr_dotdot is true clear the .. 2902 * directory else check to see if the directory is empty. 2903 * 2904 * Using a struct dirtemplate here is not precisely 2905 * what we want, but better than using a struct direct. 2906 * 2907 * clr_dotdot is used as a flag to tell us if we need 2908 * to clear the dotdot entry 2909 * 2910 * N.B.: does not handle corrupted directories. 
 */
/*
 * Return value of ufs_dirscan(): with clr_dotdot clear, 1 if only ".",
 * ".." (referring to parentino) and free entries were found, 0 otherwise
 * (including on a read error).  With clr_dotdot set, the result of
 * ufs_dirclrdotdot() once a ".." entry referring to parentino is found;
 * NOTE(review): if no such entry is found in that mode the 0/1 "empty"
 * value is returned instead, so the result is not a clean error code.
 * The one caller visible in this part of the file ignores it.
 *
 * The caller must hold ip->i_contents (asserted below).
 */
static int
ufs_dirscan(
	struct inode *ip,
	ino_t parentino,
	struct cred *cr,
	int clr_dotdot)
{
	offset_t off;
	struct dirtemplate dbuf;
	struct direct *dp = (struct direct *)&dbuf;
	int err, count;
	int empty = 1;	/* Assume it's empty */
	/* Only the leading MINDIRSIZ bytes of each entry are read. */
#define	MINDIRSIZ (sizeof (struct dirtemplate) / 2)

	ASSERT(RW_LOCK_HELD(&ip->i_contents));

	ASSERT(ip->i_size <= (offset_t)MAXOFF_T);
	/* Step from entry to entry using each entry's record length. */
	for (off = 0; off < ip->i_size; off += dp->d_reclen) {
		err = ufs_rdwri(UIO_READ, FREAD, ip, (caddr_t)dp,
		    (ssize_t)MINDIRSIZ, off, UIO_SYSSPACE, &count, cr);
		/*
		 * Since we read MINDIRSIZ, residual must
		 * be 0 unless we're at end of file.
		 */
		if (err || count != 0 || dp->d_reclen == 0) {
			empty = 0;
			break;
		}
		/* skip empty entries */
		if (dp->d_ino == 0)
			continue;
		/* accept only "." and ".." */
		if (dp->d_namlen > 2 || dp->d_name[0] != '.') {
			empty = 0;
			break;
		}
		/*
		 * At this point d_namlen must be 1 or 2.
		 * 1 implies ".", 2 implies ".." if second
		 * char is also "."
		 */
		if (dp->d_namlen == 1)
			continue;
		if (dp->d_name[1] == '.' &&
		    (ino_t)dp->d_ino == parentino) {
			/*
			 * If we're doing a purge we need to check for
			 * the . and .. entries and clear the d_ino for ..
			 *
			 * if clr_dotdot is set ufs_dirscan does not
			 * check for an empty directory.
			 */
			if (clr_dotdot) {
				/*
				 * Have to actually zap the ..
				 * entry in the directory, as
				 * otherwise someone might have
				 * dp as its cwd and try to
				 * open .., which now points to
				 * an unallocated inode.
				 */
				empty = ufs_dirclrdotdot(ip, parentino);
				break;
			} else {
				continue;
			}
		}
		/* a two-character name that is not ".." of parentino */
		empty = 0;
		break;
	}
	return (empty);
}

clock_t retry_backoff_delay = 1;	/* delay before retrying the i_rwlock */
uint64_t dircheck_retry_cnt;	/* bumped on each backoff in ufs_dircheckpath */
/*
 * Check if source directory inode is in the path of the target directory.
 * Target is supplied locked.
 *
 * The source and target inode's should be different upon entry.
 *
 *	source_ino	inode number that must not appear among the
 *			ancestors of target
 *	target		directory the walk starts from; its i_rwlock is
 *			held by the caller (asserted below) and is never
 *			released here
 *	sdp		a directory whose i_rwlock the caller already
 *			holds (asserted below); the walk stops
 *			successfully when it reaches sdp
 *	cr		credentials passed to ufs_iget_alloced()
 *
 * Returns 0 if source_ino was not found on the way up to the root or to
 * sdp, EINVAL if target is the source or the source is an ancestor,
 * ENOTDIR if an inode on the path does not look like a valid directory,
 * EAGAIN if an ancestor's i_rwlock could not be taken without risking a
 * deadlock, or the error from blkatoff() or ufs_iget_alloced().
 *
 * Every ancestor other than target is held and read-locked only while
 * it is being examined.
 */
int
ufs_dircheckpath(
	ino_t source_ino,
	struct inode *target,
	struct inode *sdp,
	struct cred *cr)
{
	struct fbuf *fbp;
	struct dirtemplate *dirp;
	struct inode *ip;
	struct ufsvfs *ufsvfsp;
	struct inode *tip;
	ino_t dotdotino;
	int err;

	ASSERT(target->i_ufsvfs != NULL);
	ASSERT(RW_LOCK_HELD(&target->i_rwlock));
	ASSERT(RW_LOCK_HELD(&sdp->i_rwlock));

	ip = target;
	if (ip->i_number == source_ino) {
		err = EINVAL;
		goto out;
	}
	if (ip->i_number == UFSROOTINO) {
		err = 0;
		goto out;
	}
	/*
	 * Search back through the directory tree, using the ".." entries.
	 * Fail any attempt to move a directory into an ancestor directory.
	 */
	fbp = NULL;
	for (;;) {
		struct vfs	*vfs;

		/* Map the first block of ip; it holds "." and "..". */
		err = blkatoff(ip, (off_t)0, (char **)&dirp, &fbp);
		if (err)
			break;
		if (((ip->i_mode & IFMT) != IFDIR) || ip->i_nlink == 0 ||
		    ip->i_size < sizeof (struct dirtemplate)) {
			dirbad(ip, "bad size, unlinked or not dir", (off_t)0);
			err = ENOTDIR;
			break;
		}
		if (dirp->dotdot_namlen != 2 ||
		    dirp->dotdot_name[0] != '.' ||
		    dirp->dotdot_name[1] != '.') {
			dirbad(ip, "mangled .. entry", (off_t)0);
			err = ENOTDIR;		/* Sanity check */
			break;
		}
		dotdotino = (ino_t)dirp->dotdot_ino;
		if (dotdotino == source_ino) {
			err = EINVAL;
			break;
		}
		/* Reached the root without meeting the source: err is 0. */
		if (dotdotino == UFSROOTINO)
			break;
		if (fbp) {
			fbrelse(fbp, S_OTHER);
			fbp = NULL;
		}
		/* Remember these before the hold on ip may be dropped. */
		vfs = ip->i_vfs;
		ufsvfsp = ip->i_ufsvfs;

		if (ip != target) {
			rw_exit(&ip->i_rwlock);
			VN_RELE(ITOV(ip));
		}
		/*
		 * Race to get the inode.
		 */
		rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
		if (err = ufs_iget_alloced(vfs, dotdotino, &tip, cr)) {
			rw_exit(&ufsvfsp->vfs_dqrwlock);
			/* nothing is held any more; tell "out" so */
			ip = NULL;
			break;
		}
		rw_exit(&ufsvfsp->vfs_dqrwlock);
		/*
		 * If the directory of the source inode (also a directory)
		 * is the same as this next entry up the chain, then
		 * we know the source directory itself can't be in the
		 * chain. This also prevents a panic because we already
		 * have sdp->i_rwlock locked.
		 */
		if (tip == sdp) {
			VN_RELE(ITOV(tip));
			ip = NULL;
			break;
		}
		ip = tip;

		/*
		 * If someone has set the WRITE_WANTED bit in this lock and if
		 * this happens to be a sdp or tdp of another parallel rename
		 * which is executing the same code and in similar situation
		 * we end up in a 4 way deadlock. We need to make sure that
		 * the WRITE_WANTED bit is not set.
		 */
retry_lock:
		if (!rw_tryenter(&ip->i_rwlock, RW_READER)) {
			/*
			 * If the lock is held as WRITER that's fine, but if
			 * it has the WRITE_WANTED bit set we might end up in
			 * a deadlock. If WRITE_WANTED is set we return
			 * with EAGAIN else we just go back and try.
			 */
			if (RW_ISWRITER(&ip->i_rwlock) &&
			    !(RW_WRITE_HELD(&ip->i_rwlock))) {
				/*
				 * The lock was not obtained, so only the
				 * hold on ip has to be dropped here.
				 */
				err = EAGAIN;
				if (fbp) {
					fbrelse(fbp, S_OTHER);
				}
				VN_RELE(ITOV(ip));
				return (err);
			} else {
				/*
				 * The lock is being write held. We could
				 * just do a rw_enter here but there is a
				 * window between the check and now, where
				 * the status could have changed, so to
				 * avoid looping we backoff and go back to
				 * try for the lock.
				 */
				delay(retry_backoff_delay);
				dircheck_retry_cnt++;
				goto retry_lock;
			}
		}
	}
	if (fbp) {
		fbrelse(fbp, S_OTHER);
	}
out:
	/* Release the last ancestor visited, but never the target. */
	if (ip) {
		if (ip != target) {
			rw_exit(&ip->i_rwlock);
			VN_RELE(ITOV(ip));
		}
	}
	return (err);
}

/*
 * Check whether an extended attribute directory is empty.
 *
 * Returns 1 if the directory holds nothing but free entries, a "."
 * entry whose inode number equals parentino, and ".." entries; returns 0
 * otherwise, including when an entry cannot be read.
 *
 * NOTE(review): it is the "." entry, not "..", whose inode number is
 * compared with parentino - confirm what callers pass in.
 *
 * The caller must hold ip->i_contents (asserted below).
 */
int
ufs_xattrdirempty(struct inode *ip, ino_t parentino, struct cred *cr)
{
	offset_t off;
	struct dirtemplate dbuf;
	struct direct *dp = (struct direct *)&dbuf;
	int err, count;
	int empty = 1;	/* Assume it's empty */
	/* Only the leading MINDIRSIZ bytes of each entry are read. */
#define	MINDIRSIZ (sizeof (struct dirtemplate) / 2)

	ASSERT(RW_LOCK_HELD(&ip->i_contents));

	ASSERT(ip->i_size <= (offset_t)MAXOFF_T);
	for (off = 0; off < ip->i_size; off += dp->d_reclen) {
		err = ufs_rdwri(UIO_READ, FREAD, ip, (caddr_t)dp,
		    (ssize_t)MINDIRSIZ, off, UIO_SYSSPACE, &count, cr);
		/*
		 * Since we read MINDIRSIZ, residual must
		 * be 0 unless we're at end of file.
		 */

		if (err || count != 0 || dp->d_reclen == 0) {
			empty = 0;
			break;
		}
		/* skip empty entries */
		if (dp->d_ino == 0)
			continue;
		/*
		 * Tolerate "." (only when its inode number matches
		 * parentino) and "..".  Anything else means the
		 * directory is not empty.
		 */

		if (dp->d_namlen == 1 && dp->d_name[0] == '.' &&
		    (ino_t)dp->d_ino == parentino)
			continue;

		if (dp->d_namlen == 2 && dp->d_name[0] == '.' &&
		    dp->d_name[1] == '.') {
			continue;
		}
		empty = 0;
		break;
	}
	return (empty);
}


/*
 * Allocate and initialize a new shadow inode to contain extended attributes.
3189 */ 3190 int 3191 ufs_xattrmkdir( 3192 struct inode *tdp, 3193 struct inode **ipp, 3194 int flags, 3195 struct cred *cr) 3196 { 3197 struct inode *ip; 3198 struct vattr va; 3199 int err; 3200 int retry = 1; 3201 struct ufsvfs *ufsvfsp; 3202 struct ulockfs *ulp; 3203 int issync; 3204 int trans_size; 3205 int dorwlock; /* 0 = not yet taken, */ 3206 /* 1 = taken outside the transaction, */ 3207 /* 2 = taken inside the transaction */ 3208 3209 /* 3210 * Validate permission to create attribute directory 3211 */ 3212 3213 if ((err = ufs_iaccess(tdp, IWRITE, cr)) != 0) { 3214 return (err); 3215 } 3216 3217 if (vn_is_readonly(ITOV(tdp))) 3218 return (EROFS); 3219 3220 /* 3221 * No need to re-init err after again:, since it's set before 3222 * the next use of it. 3223 */ 3224 again: 3225 dorwlock = 0; 3226 va.va_type = VDIR; 3227 va.va_uid = tdp->i_uid; 3228 va.va_gid = tdp->i_gid; 3229 3230 if ((tdp->i_mode & IFMT) == IFDIR) { 3231 va.va_mode = (o_mode_t)IFATTRDIR; 3232 va.va_mode |= tdp->i_mode & 0777; 3233 } else { 3234 va.va_mode = (o_mode_t)IFATTRDIR|0700; 3235 if (tdp->i_mode & 0040) 3236 va.va_mode |= 0750; 3237 if (tdp->i_mode & 0004) 3238 va.va_mode |= 0705; 3239 } 3240 va.va_mask = AT_TYPE|AT_MODE; 3241 3242 ufsvfsp = tdp->i_ufsvfs; 3243 3244 err = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_MKDIR_MASK); 3245 if (err) 3246 return (err); 3247 3248 /* 3249 * Acquire i_rwlock before TRANS_BEGIN_CSYNC() if this is a file. 3250 * This follows the protocol for read()/write(). 3251 */ 3252 if (ITOV(tdp)->v_type != VDIR) { 3253 rw_enter(&tdp->i_rwlock, RW_WRITER); 3254 dorwlock = 1; 3255 } 3256 3257 if (ulp) { 3258 trans_size = (int)TOP_MKDIR_SIZE(tdp); 3259 TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_MKDIR, trans_size); 3260 } 3261 3262 /* 3263 * Acquire i_rwlock after TRANS_BEGIN_CSYNC() if this is a directory. 3264 * This follows the protocol established by 3265 * ufs_link/create/remove/rename/mkdir/rmdir/symlink. 
3266 */ 3267 if (dorwlock == 0) { 3268 rw_enter(&tdp->i_rwlock, RW_WRITER); 3269 dorwlock = 2; 3270 } 3271 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER); 3272 rw_enter(&tdp->i_contents, RW_WRITER); 3273 3274 /* 3275 * Suppress out of inodes messages if we will retry. 3276 */ 3277 if (retry) 3278 tdp->i_flag |= IQUIET; 3279 err = ufs_dirmakeinode(tdp, &ip, &va, DE_ATTRDIR, cr); 3280 tdp->i_flag &= ~IQUIET; 3281 3282 if (err) 3283 goto fail; 3284 3285 if (flags) { 3286 3287 /* 3288 * Now attach it to src file. 3289 */ 3290 3291 tdp->i_oeftflag = ip->i_number; 3292 } 3293 3294 ip->i_cflags |= IXATTR; 3295 ITOV(ip)->v_flag |= V_XATTRDIR; 3296 TRANS_INODE(ufsvfsp, tdp); 3297 tdp->i_flag |= ICHG | IUPD; 3298 tdp->i_seq++; 3299 ufs_iupdat(tdp, I_SYNC); 3300 rw_exit(&tdp->i_contents); 3301 rw_exit(&ufsvfsp->vfs_dqrwlock); 3302 3303 rw_enter(&ip->i_rwlock, RW_WRITER); 3304 rw_enter(&ip->i_contents, RW_WRITER); 3305 TRANS_INODE(ufsvfsp, ip); 3306 ip->i_flag |= ICHG| IUPD; 3307 ip->i_seq++; 3308 ufs_iupdat(ip, I_SYNC); 3309 rw_exit(&ip->i_contents); 3310 rw_exit(&ip->i_rwlock); 3311 if (dorwlock == 2) 3312 rw_exit(&tdp->i_rwlock); 3313 if (ulp) { 3314 int terr = 0; 3315 3316 TRANS_END_CSYNC(ufsvfsp, err, issync, TOP_MKDIR, trans_size); 3317 ufs_lockfs_end(ulp); 3318 if (err == 0) 3319 err = terr; 3320 } 3321 if (dorwlock == 1) 3322 rw_exit(&tdp->i_rwlock); 3323 *ipp = ip; 3324 return (err); 3325 3326 fail: 3327 rw_exit(&tdp->i_contents); 3328 rw_exit(&ufsvfsp->vfs_dqrwlock); 3329 if (dorwlock == 2) 3330 rw_exit(&tdp->i_rwlock); 3331 if (ulp) { 3332 TRANS_END_CSYNC(ufsvfsp, err, issync, TOP_MKDIR, trans_size); 3333 ufs_lockfs_end(ulp); 3334 } 3335 if (dorwlock == 1) 3336 rw_exit(&tdp->i_rwlock); 3337 if (ip != NULL) 3338 VN_RELE(ITOV(ip)); 3339 3340 /* 3341 * No inodes? See if any are tied up in pending deletions. 3342 * This has to be done outside of any of the above, because 3343 * the draining operation can't be done from inside a transaction. 
3344 */ 3345 if ((err == ENOSPC) && retry && TRANS_ISTRANS(ufsvfsp)) { 3346 ufs_delete_drain_wait(ufsvfsp, 1); 3347 retry = 0; 3348 goto again; 3349 } 3350 3351 return (err); 3352 } 3353 3354 /* 3355 * clear the dotdot directory entry. 3356 * Used by ufs_dirscan when clr_dotdot 3357 * flag is set and we're deleting a 3358 * directory. 3359 */ 3360 static int 3361 ufs_dirclrdotdot(struct inode *ip, ino_t parentino) 3362 { 3363 struct fbuf *fbp; 3364 struct direct *dotp, *dotdotp; 3365 int err = 0; 3366 3367 ASSERT(RW_WRITE_HELD(&ip->i_rwlock)); 3368 ASSERT(RW_LOCK_HELD(&ip->i_contents)); 3369 err = blkatoff(ip, 0, NULL, &fbp); 3370 if (err) { 3371 return (err); 3372 } 3373 3374 dotp = (struct direct *)fbp->fb_addr; 3375 if ((dotp->d_namlen < (MAXNAMLEN + 1)) && 3376 ((DIRBLKSIZ - DIRSIZ(dotp)) >= (sizeof (struct dirtemplate) / 2))) { 3377 dotdotp = (struct direct *)((char *)dotp + dotp->d_reclen); 3378 if ((dotdotp->d_namlen < (MAXNAMLEN + 1)) && 3379 ((DIRBLKSIZ - DIRSIZ(dotp)) >= dotdotp->d_reclen)) { 3380 3381 dotp->d_reclen += dotdotp->d_reclen; 3382 if (parentino == dotdotp->d_ino) { 3383 dotdotp->d_ino = 0; 3384 dotdotp->d_namlen = 0; 3385 dotdotp->d_reclen = 0; 3386 } 3387 3388 err = TRANS_DIR(ip, 0); 3389 if (err) { 3390 fbrelse(fbp, S_OTHER); 3391 } else { 3392 err = ufs_fbwrite(fbp, ip); 3393 } 3394 } 3395 } else { 3396 err = -1; 3397 } 3398 return (err); 3399 } 3400