1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License, Version 1.0 only 6 * (the "License"). You may not use this file except in compliance 7 * with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright 2005 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ 28 /* All Rights Reserved */ 29 30 /* 31 * University Copyright- Copyright (c) 1982, 1986, 1988 32 * The Regents of the University of California 33 * All Rights Reserved 34 * 35 * University Acknowledgment- Portions of this document are derived from 36 * software developed by the University of California, Berkeley, and its 37 * contributors. 38 */ 39 40 41 #pragma ident "%Z%%M% %I% %E% SMI" 42 43 /* 44 * Directory manipulation routines. 45 * 46 * When manipulating directories, the i_rwlock provides serialization 47 * since directories cannot be mmapped. The i_contents lock is redundant. 
48 */ 49 50 #include <sys/types.h> 51 #include <sys/t_lock.h> 52 #include <sys/param.h> 53 #include <sys/systm.h> 54 #include <sys/signal.h> 55 #include <sys/cred.h> 56 #include <sys/proc.h> 57 #include <sys/disp.h> 58 #include <sys/user.h> 59 #include <sys/vfs.h> 60 #include <sys/vnode.h> 61 #include <sys/stat.h> 62 #include <sys/mode.h> 63 #include <sys/buf.h> 64 #include <sys/uio.h> 65 #include <sys/dnlc.h> 66 #include <sys/fs/ufs_inode.h> 67 #include <sys/fs/ufs_fs.h> 68 #include <sys/mount.h> 69 #include <sys/fs/ufs_fsdir.h> 70 #include <sys/fs/ufs_trans.h> 71 #include <sys/fs/ufs_panic.h> 72 #include <sys/fs/ufs_quota.h> 73 #include <sys/errno.h> 74 #include <sys/debug.h> 75 #include <vm/seg.h> 76 #include <sys/sysmacros.h> 77 #include <sys/cmn_err.h> 78 #include <sys/cpuvar.h> 79 #include <sys/unistd.h> 80 #include <sys/policy.h> 81 82 /* 83 * This is required since we're using P2ROUNDUP_TYPED on DIRBLKSIZ 84 */ 85 #if !ISP2(DIRBLKSIZ) 86 #error "DIRBLKSIZ not a power of 2" 87 #endif 88 89 /* 90 * A virgin directory. 91 */ 92 static struct dirtemplate mastertemplate = { 93 0, 12, 1, ".", 94 0, DIRBLKSIZ - 12, 2, ".." 95 }; 96 97 #define LDIRSIZ(len) \ 98 ((sizeof (struct direct) - (MAXNAMLEN + 1)) + ((len + 1 + 3) &~ 3)) 99 #define MAX_DIR_NAME_LEN(len) \ 100 (((len) - (sizeof (struct direct) - (MAXNAMLEN + 1))) - 1) 101 102 /* 103 * The dnlc directory cache allows a 64 bit handle for directory entries. 104 * For ufs we squeeze both the 32 bit inumber and a 32 bit disk offset 105 * into the handle. Note, a 32 bit offset allows a 4GB directory, which 106 * is way beyond what could be cached in memory by the directory 107 * caching routines. So we are quite safe with this limit. 108 * The macros below pack and unpack the handle. 
109 */ 110 #define H_TO_INO(h) (uint32_t)((h) & UINT_MAX) 111 #define H_TO_OFF(h) (off_t)((h) >> 32) 112 #define INO_OFF_TO_H(ino, off) (uint64_t)(((uint64_t)(off) << 32) | (ino)) 113 114 /* 115 * The average size of a typical on disk directory entry is about 16 bytes 116 * and so defines AV_DIRECT_SHIFT : log2(16) 117 * This define is only used to approximate the number of entries 118 * is a directory. This is needed for dnlc_dir_start() which will immediately 119 * return an error if the value is not within its acceptable range of 120 * number of files in a directory. 121 */ 122 #define AV_DIRECT_SHIFT 4 123 /* 124 * If the directory size (from i_size) is greater than the ufs_min_dir_cache 125 * tunable then we request dnlc directory caching. 126 * This has found to be profitable after 1024 file names. 127 */ 128 int ufs_min_dir_cache = 1024 << AV_DIRECT_SHIFT; 129 130 #ifdef DEBUG 131 int dirchk = 1; 132 #else /* !DEBUG */ 133 int dirchk = 0; 134 #endif /* DEBUG */ 135 int ufs_negative_cache = 1; 136 uint64_t ufs_dirremove_retry_cnt; 137 138 static void dirbad(); 139 static int ufs_dirrename(); 140 static int ufs_diraddentry(); 141 static int ufs_dirempty(); 142 static int ufs_dirscan(); 143 static int ufs_dirclrdotdot(); 144 static int ufs_dirfixdotdot(); 145 static int ufs_dirpurgedotdot(); 146 static int dirprepareentry(); 147 static int ufs_dirmakedirect(); 148 static int dirbadname(); 149 static int dirmangled(); 150 151 /* 152 * Look for a given name in a directory. On successful return, *ipp 153 * will point to the VN_HELD inode. 
154 */ 155 int 156 ufs_dirlook( 157 struct inode *dp, 158 char *namep, 159 struct inode **ipp, 160 struct cred *cr, 161 int skipdnlc) /* skip the 1st level dnlc */ 162 { 163 uint64_t handle; 164 struct fbuf *fbp; /* a buffer of directory entries */ 165 struct direct *ep; /* the current directory entry */ 166 struct vnode *vp; 167 struct vnode *dvp; /* directory vnode ptr */ 168 dcanchor_t *dcap; 169 off_t endsearch; /* offset to end directory search */ 170 off_t offset; 171 off_t start_off; /* starting offset from middle search */ 172 off_t last_offset; /* last offset */ 173 int entryoffsetinblock; /* offset of ep in addr's buffer */ 174 int numdirpasses; /* strategy for directory search */ 175 int namlen; /* length of name */ 176 int err; 177 int doingchk; 178 int i; 179 int caching; 180 ino_t ep_ino; /* entry i number */ 181 ino_t chkino; 182 ushort_t ep_reclen; /* direct local d_reclen */ 183 184 ASSERT(*namep != '\0'); /* All callers ensure *namep is non null */ 185 186 /* 187 * Check accessibility of directory. 188 */ 189 if (((dp->i_mode & IFMT) != IFDIR) && 190 ((dp->i_mode & IFMT) != IFATTRDIR)) 191 return (ENOTDIR); 192 193 if (err = ufs_iaccess(dp, IEXEC, cr)) 194 return (err); 195 196 /* 197 * Check the directory name lookup cache, first for individual files 198 * then for complete directories. 199 */ 200 dvp = ITOV(dp); 201 if (!skipdnlc && (vp = dnlc_lookup(dvp, namep))) { 202 /* vp is already held from dnlc_lookup */ 203 if (vp == DNLC_NO_VNODE) { 204 VN_RELE(vp); 205 return (ENOENT); 206 } 207 *ipp = VTOI(vp); 208 return (0); 209 } 210 211 dcap = &dp->i_danchor; 212 213 /* 214 * Grab the reader lock on the directory data before checking 215 * the dnlc to avoid a race with ufs_dirremove() & friends. 216 */ 217 rw_enter(&dp->i_rwlock, RW_READER); 218 219 switch (dnlc_dir_lookup(dcap, namep, &handle)) { 220 case DFOUND: 221 ep_ino = (ino_t)H_TO_INO(handle); 222 if (dp->i_number == ep_ino) { 223 VN_HOLD(dvp); /* want ourself, "." 
*/ 224 *ipp = dp; 225 rw_exit(&dp->i_rwlock); 226 return (0); 227 } 228 if (namep[0] == '.' && namep[1] == '.' && namep[2] == 0) { 229 uint64_t handle2; 230 /* 231 * release the lock on the dir we are searching 232 * to avoid a deadlock when grabbing the 233 * i_contents lock in ufs_iget_alloced(). 234 */ 235 rw_exit(&dp->i_rwlock); 236 rw_enter(&dp->i_ufsvfs->vfs_dqrwlock, RW_READER); 237 err = ufs_iget_alloced(dp->i_vfs, ep_ino, ipp, cr); 238 rw_exit(&dp->i_ufsvfs->vfs_dqrwlock); 239 /* 240 * must recheck as we dropped dp->i_rwlock 241 */ 242 rw_enter(&dp->i_rwlock, RW_READER); 243 if (!err && (dnlc_dir_lookup(dcap, namep, &handle2) 244 == DFOUND) && (handle == handle2)) { 245 dnlc_update(dvp, namep, ITOV(*ipp)); 246 rw_exit(&dp->i_rwlock); 247 return (0); 248 } 249 /* check failed, read the actual directory */ 250 if (!err) { 251 VN_RELE(ITOV(*ipp)); 252 } 253 goto restart; 254 } 255 /* usual case of not "." nor ".." */ 256 rw_enter(&dp->i_ufsvfs->vfs_dqrwlock, RW_READER); 257 err = ufs_iget_alloced(dp->i_vfs, ep_ino, ipp, cr); 258 rw_exit(&dp->i_ufsvfs->vfs_dqrwlock); 259 if (err) { 260 rw_exit(&dp->i_rwlock); 261 return (err); 262 } 263 dnlc_update(dvp, namep, ITOV(*ipp)); 264 rw_exit(&dp->i_rwlock); 265 return (0); 266 case DNOENT: 267 if (ufs_negative_cache && (dp->i_nlink > 0)) { 268 dnlc_enter(dvp, namep, DNLC_NO_VNODE); 269 } 270 rw_exit(&dp->i_rwlock); 271 return (ENOENT); 272 default: 273 break; 274 } 275 restart: 276 277 fbp = NULL; 278 doingchk = 0; 279 chkino = 0; 280 caching = 0; 281 282 /* 283 * Attempt to cache any directories greater than 284 * the tunable ufs_min_cache_dir. 
285 */ 286 if ((dp->i_size >= ufs_min_dir_cache) && (dp->i_cachedir)) { 287 switch (dnlc_dir_start(dcap, dp->i_size >> AV_DIRECT_SHIFT)) { 288 case DNOMEM: 289 case DTOOBIG: 290 dp->i_cachedir = 0; 291 break; 292 case DOK: 293 caching = 1; 294 break; 295 default: 296 break; 297 } 298 } 299 /* 300 * If caching we don't stop when the file has been 301 * found, but need to know later, so clear *ipp now 302 */ 303 *ipp = NULL; 304 305 recheck: 306 if (caching) { 307 offset = 0; 308 entryoffsetinblock = 0; 309 numdirpasses = 1; 310 } else { 311 /* 312 * Take care to look at dp->i_diroff only once, as it 313 * may be changing due to other threads/cpus. 314 */ 315 offset = dp->i_diroff; 316 if (offset > dp->i_size) { 317 offset = 0; 318 } 319 if (offset == 0) { 320 entryoffsetinblock = 0; 321 numdirpasses = 1; 322 } else { 323 start_off = offset; 324 325 entryoffsetinblock = blkoff(dp->i_fs, offset); 326 if (entryoffsetinblock != 0) { 327 err = blkatoff(dp, offset, (char **)0, &fbp); 328 if (err) 329 goto bad; 330 } 331 numdirpasses = 2; 332 } 333 } 334 endsearch = P2ROUNDUP_TYPED(dp->i_size, DIRBLKSIZ, u_offset_t); 335 namlen = strlen(namep); 336 last_offset = 0; 337 338 searchloop: 339 while (offset < endsearch) { 340 /* 341 * If offset is on a block boundary, 342 * read the next directory block. 343 * Release previous if it exists. 344 */ 345 if (blkoff(dp->i_fs, offset) == 0) { 346 if (fbp != NULL) { 347 fbrelse(fbp, S_OTHER); 348 } 349 err = blkatoff(dp, offset, (char **)0, &fbp); 350 if (err) 351 goto bad; 352 entryoffsetinblock = 0; 353 } 354 355 /* 356 * If the offset to the next entry is invalid or if the 357 * next entry is a zero length record or if the record 358 * length is invalid, then skip to the next directory 359 * block. Complete validation checks are done if the 360 * record length is invalid. 361 * 362 * Full validation checks are slow so they are disabled 363 * by default. Complete checks can be run by patching 364 * "dirchk" to be true. 
365 * 366 * We have to check the validity of entryoffsetinblock 367 * here because it can be set to i_diroff above. 368 */ 369 ep = (struct direct *)(fbp->fb_addr + entryoffsetinblock); 370 if ((entryoffsetinblock & 0x3) || ep->d_reclen == 0 || 371 (dirchk || (ep->d_reclen & 0x3)) && 372 dirmangled(dp, ep, entryoffsetinblock, offset)) { 373 i = DIRBLKSIZ - (entryoffsetinblock & (DIRBLKSIZ - 1)); 374 offset += i; 375 entryoffsetinblock += i; 376 if (caching) { 377 dnlc_dir_purge(dcap); 378 caching = 0; 379 } 380 continue; 381 } 382 383 ep_reclen = ep->d_reclen; 384 385 /* 386 * Add named entries and free space into the directory cache 387 */ 388 if (caching) { 389 ushort_t extra; 390 off_t off2; 391 392 if (ep->d_ino == 0) { 393 extra = ep_reclen; 394 if (offset & (DIRBLKSIZ - 1)) { 395 dnlc_dir_purge(dcap); 396 dp->i_cachedir = 0; 397 caching = 0; 398 } 399 } else { 400 /* 401 * entries hold the previous offset except the 402 * 1st which holds the offset + 1 403 */ 404 if (offset & (DIRBLKSIZ - 1)) { 405 off2 = last_offset; 406 } else { 407 off2 = offset + 1; 408 } 409 caching = (dnlc_dir_add_entry(dcap, ep->d_name, 410 INO_OFF_TO_H(ep->d_ino, off2)) == DOK); 411 extra = ep_reclen - DIRSIZ(ep); 412 } 413 if (caching && (extra >= LDIRSIZ(1))) { 414 caching = (dnlc_dir_add_space(dcap, extra, 415 (uint64_t)offset) == DOK); 416 } 417 } 418 419 /* 420 * Check for a name match. 421 * We have the parent inode read locked with i_rwlock. 422 */ 423 if (ep->d_ino && ep->d_namlen == namlen && 424 *namep == *ep->d_name && /* fast chk 1st chr */ 425 bcmp(namep, ep->d_name, (int)ep->d_namlen) == 0) { 426 427 /* 428 * We have to release the fbp early here to avoid 429 * a possible deadlock situation where we have the 430 * fbp and want the directory inode and someone doing 431 * a ufs_direnter_* has the directory inode and wants 432 * the fbp. XXX - is this still needed? 
433 */ 434 ep_ino = (ino_t)ep->d_ino; 435 ASSERT(fbp != NULL); 436 fbrelse(fbp, S_OTHER); 437 fbp = NULL; 438 439 /* 440 * Atomic update (read lock held) 441 */ 442 dp->i_diroff = offset; 443 444 if (namlen == 2 && namep[0] == '.' && namep[1] == '.') { 445 struct timeval32 omtime; 446 447 if (caching) { 448 dnlc_dir_purge(dcap); 449 caching = 0; 450 } 451 if (doingchk) { 452 /* 453 * if the inumber didn't change 454 * continue with already found inode. 455 */ 456 if (ep_ino == chkino) 457 goto checkok; 458 else { 459 VN_RELE(ITOV(*ipp)); 460 /* *ipp is nulled at restart */ 461 goto restart; 462 } 463 } 464 /* 465 * release the lock on the dir we are searching 466 * to avoid a deadlock when grabbing the 467 * i_contents lock in ufs_iget_alloced(). 468 */ 469 omtime = dp->i_mtime; 470 rw_exit(&dp->i_rwlock); 471 rw_enter(&dp->i_ufsvfs->vfs_dqrwlock, 472 RW_READER); 473 err = ufs_iget_alloced(dp->i_vfs, ep_ino, ipp, 474 cr); 475 rw_exit(&dp->i_ufsvfs->vfs_dqrwlock); 476 rw_enter(&dp->i_rwlock, RW_READER); 477 if (err) 478 goto bad; 479 /* 480 * Since we released the lock on the directory, 481 * we must check that the same inode is still 482 * the ".." entry for this directory. 483 */ 484 /*CSTYLED*/ 485 if (timercmp(&omtime, &dp->i_mtime, !=)) { 486 /* 487 * Modification time changed on the 488 * directory, we must go check if 489 * the inumber changed for ".." 490 */ 491 doingchk = 1; 492 chkino = ep_ino; 493 entryoffsetinblock = 0; 494 if (caching) { 495 /* 496 * Forget directory caching 497 * for this rare case 498 */ 499 dnlc_dir_purge(dcap); 500 caching = 0; 501 } 502 goto recheck; 503 } 504 } else if (dp->i_number == ep_ino) { 505 VN_HOLD(dvp); /* want ourself, "." 
*/ 506 *ipp = dp; 507 if (caching) { 508 dnlc_dir_purge(dcap); 509 caching = 0; 510 } 511 } else { 512 rw_enter(&dp->i_ufsvfs->vfs_dqrwlock, 513 RW_READER); 514 err = ufs_iget_alloced(dp->i_vfs, ep_ino, ipp, 515 cr); 516 rw_exit(&dp->i_ufsvfs->vfs_dqrwlock); 517 if (err) 518 goto bad; 519 } 520 checkok: 521 ASSERT(*ipp); 522 dnlc_update(dvp, namep, ITOV(*ipp)); 523 /* 524 * If we are not caching then just return the entry 525 * otherwise complete loading up the cache 526 */ 527 if (!caching) { 528 rw_exit(&dp->i_rwlock); 529 return (0); 530 } 531 err = blkatoff(dp, offset, (char **)0, &fbp); 532 if (err) 533 goto bad; 534 } 535 last_offset = offset; 536 offset += ep_reclen; 537 entryoffsetinblock += ep_reclen; 538 } 539 /* 540 * If we started in the middle of the directory and failed 541 * to find our target, we must check the beginning as well. 542 */ 543 if (numdirpasses == 2) { 544 numdirpasses--; 545 offset = 0; 546 endsearch = start_off; 547 goto searchloop; 548 } 549 550 /* 551 * If whole directory caching is on (or was originally on) then 552 * the entry may have been found. 553 */ 554 if (*ipp == NULL) { 555 err = ENOENT; 556 if (ufs_negative_cache && (dp->i_nlink > 0)) { 557 dnlc_enter(dvp, namep, DNLC_NO_VNODE); 558 } 559 } 560 if (caching) { 561 dnlc_dir_complete(dcap); 562 caching = 0; 563 } 564 565 bad: 566 if (err && *ipp) { 567 /* 568 * err and *ipp can both be set if we were attempting to 569 * cache the directory, and we found the entry, then later 570 * while trying to complete the directory cache encountered 571 * a error (eg reading a directory sector). 572 */ 573 VN_RELE(ITOV(*ipp)); 574 *ipp = NULL; 575 } 576 577 if (fbp) 578 fbrelse(fbp, S_OTHER); 579 rw_exit(&dp->i_rwlock); 580 if (caching) 581 dnlc_dir_purge(dcap); 582 return (err); 583 } 584 585 /* 586 * Write a new directory entry for DE_CREATE or DE_MKDIR operations. 
587 */ 588 int 589 ufs_direnter_cm( 590 struct inode *tdp, /* target directory to make entry in */ 591 char *namep, /* name of entry */ 592 enum de_op op, /* entry operation */ 593 struct vattr *vap, /* attributes if new inode needed */ 594 struct inode **ipp, /* return entered inode here */ 595 struct cred *cr, /* user credentials */ 596 int flags) /* no entry exists */ 597 { 598 struct inode *tip; /* inode of (existing) target file */ 599 char *s; 600 struct slot slot; /* slot info to pass around */ 601 int namlen; /* length of name */ 602 int err; /* error number */ 603 struct inode *nip; /* new inode */ 604 int do_rele_nip = 0; /* release nip */ 605 int noentry = flags & ~IQUIET; 606 int quiet = flags & IQUIET; /* Suppress out of inodes message */ 607 608 ASSERT(RW_WRITE_HELD(&tdp->i_rwlock)); 609 610 if (((tdp->i_mode & IFMT) == IFATTRDIR) && ((op == DE_MKDIR) || 611 ((vap->va_type == VCHR) || (vap->va_type == VBLK) || 612 (vap->va_type == VDOOR) || (vap->va_type == VSOCK) || 613 (vap->va_type == VFIFO)))) 614 return (EINVAL); 615 616 /* don't allow '/' characters in pathname component */ 617 for (s = namep, namlen = 0; *s; s++, namlen++) 618 if (*s == '/') 619 return (EACCES); 620 ASSERT(namlen); 621 622 /* 623 * If name is "." or ".." then if this is a create look it up 624 * and return EEXIST. 625 */ 626 if (namep[0] == '.' && 627 (namlen == 1 || (namlen == 2 && namep[1] == '.'))) { 628 /* 629 * ufs_dirlook will acquire the i_rwlock 630 */ 631 rw_exit(&tdp->i_rwlock); 632 if (err = ufs_dirlook(tdp, namep, ipp, cr, 0)) { 633 rw_enter(&tdp->i_rwlock, RW_WRITER); 634 return (err); 635 } 636 rw_enter(&tdp->i_rwlock, RW_WRITER); 637 return (EEXIST); 638 } 639 640 /* 641 * If target directory has not been removed, then we can consider 642 * allowing file to be created. 643 */ 644 if (tdp->i_nlink <= 0) { 645 return (ENOENT); 646 } 647 648 /* 649 * Check accessibility of directory. 
650 */ 651 if (((tdp->i_mode & IFMT) != IFDIR) && 652 ((tdp->i_mode & IFMT) != IFATTRDIR)) { 653 return (ENOTDIR); 654 } 655 656 /* 657 * Execute access is required to search the directory. 658 */ 659 if (err = ufs_iaccess(tdp, IEXEC, cr)) { 660 return (err); 661 } 662 663 /* 664 * Search for the entry. Return VN_HELD tip if found. 665 */ 666 tip = NULL; 667 slot.fbp = NULL; 668 slot.status = NONE; 669 rw_enter(&tdp->i_ufsvfs->vfs_dqrwlock, RW_READER); 670 rw_enter(&tdp->i_contents, RW_WRITER); 671 err = ufs_dircheckforname(tdp, namep, namlen, &slot, &tip, cr, noentry); 672 if (err) 673 goto out; 674 if (tip) { 675 ASSERT(!noentry); 676 *ipp = tip; 677 err = EEXIST; 678 } else { 679 /* 680 * The entry does not exist. Check write permission in 681 * directory to see if entry can be created. 682 */ 683 if (err = ufs_iaccess(tdp, IWRITE, cr)) 684 goto out; 685 /* 686 * Make new inode and directory entry. 687 */ 688 tdp->i_flag |= quiet; 689 if (err = ufs_dirmakeinode(tdp, &nip, vap, op, cr)) { 690 if (nip != NULL) 691 do_rele_nip = 1; 692 goto out; 693 } 694 if (err = ufs_diraddentry(tdp, namep, op, 695 namlen, &slot, nip, NULL, cr)) { 696 /* 697 * Unmake the inode we just made. 698 */ 699 rw_enter(&nip->i_contents, RW_WRITER); 700 if (((nip->i_mode & IFMT) == IFDIR) || 701 ((nip->i_mode & IFMT) == IFATTRDIR)) { 702 tdp->i_nlink--; 703 ufs_setreclaim(tdp); 704 tdp->i_flag |= ICHG; 705 tdp->i_seq++; 706 TRANS_INODE(tdp->i_ufsvfs, tdp); 707 ITIMES_NOLOCK(tdp); 708 } 709 nip->i_nlink = 0; 710 ufs_setreclaim(nip); 711 TRANS_INODE(nip->i_ufsvfs, nip); 712 nip->i_flag |= ICHG; 713 nip->i_seq++; 714 ITIMES_NOLOCK(nip); 715 rw_exit(&nip->i_contents); 716 do_rele_nip = 1; 717 } else { 718 *ipp = nip; 719 } 720 } 721 722 out: 723 if (slot.fbp) 724 fbrelse(slot.fbp, S_OTHER); 725 726 tdp->i_flag &= ~quiet; 727 rw_exit(&tdp->i_contents); 728 729 /* 730 * Drop vfs_dqrwlock before calling VN_RELE() on nip to 731 * avoid deadlock since ufs_delete() grabs vfs_dqrwlock as reader. 
732 */ 733 rw_exit(&tdp->i_ufsvfs->vfs_dqrwlock); 734 735 if (do_rele_nip) { 736 VN_RELE(ITOV(nip)); 737 } 738 739 return (err); 740 } 741 742 /* 743 * Write a new directory entry for DE_LINK, DE_SYMLINK or DE_RENAME operations. 744 * If tvpp is non-null, return with the pointer to the target vnode. 745 */ 746 int 747 ufs_direnter_lr( 748 struct inode *tdp, /* target directory to make entry in */ 749 char *namep, /* name of entry */ 750 enum de_op op, /* entry operation */ 751 struct inode *sdp, /* source inode parent if rename */ 752 struct inode *sip, /* source inode */ 753 struct cred *cr, /* user credentials */ 754 vnode_t **tvpp) /* Return: (held) vnode of (existing) target */ 755 { 756 struct inode *tip; /* inode of (existing) target file */ 757 char *s; 758 struct slot slot; /* slot info to pass around */ 759 int namlen; /* length of name */ 760 int err; /* error number */ 761 762 /* don't allow '/' characters in pathname component */ 763 for (s = namep, namlen = 0; *s; s++, namlen++) 764 if (*s == '/') 765 return (EACCES); 766 ASSERT(namlen); 767 ASSERT(RW_WRITE_HELD(&tdp->i_rwlock)); 768 769 /* 770 * If name is "." or ".." then if this is a create look it up 771 * and return EEXIST. Rename or link TO "." or ".." is forbidden. 772 */ 773 if (namep[0] == '.' && 774 (namlen == 1 || (namlen == 2 && namep[1] == '.'))) { 775 if (op == DE_RENAME) { 776 return (EINVAL); /* *SIGH* should be ENOTEMPTY */ 777 } 778 return (EEXIST); 779 } 780 /* 781 * For link and rename lock the source entry and check the link count 782 * to see if it has been removed while it was unlocked. If not, we 783 * increment the link count and force the inode to disk to make sure 784 * that it is there before any directory entry that points to it. 785 * 786 * In the case of a symbolic link, we are dealing with a new inode 787 * which does not yet have any links. We've created it with a link 788 * count of 1, and we don't want to increment it since this will be 789 * its first link. 
790 * 791 * We are about to push the inode to disk. We make sure 792 * that the inode's data blocks are flushed first so the 793 * inode and it's data blocks are always in sync. This 794 * adds some robustness in in the event of a power failure 795 * or panic where sync fails. If we panic before the 796 * inode is updated, then the inode still refers to the 797 * old data blocks (or none for a new file). If we panic 798 * after the inode is updated, then the inode refers to 799 * the new data blocks. 800 * 801 * We do this before grabbing the i_contents lock because 802 * ufs_syncip() will want that lock. We could do the data 803 * syncing after the removal checks, but upon return from 804 * the data sync we would have to repeat the removal 805 * checks. 806 */ 807 if (err = TRANS_SYNCIP(sip, 0, I_DSYNC, TOP_FSYNC)) { 808 return (err); 809 } 810 811 rw_enter(&sip->i_contents, RW_WRITER); 812 if (sip->i_nlink <= 0) { 813 rw_exit(&sip->i_contents); 814 return (ENOENT); 815 } 816 if (sip->i_nlink == MAXLINK) { 817 rw_exit(&sip->i_contents); 818 return (EMLINK); 819 } 820 821 /* 822 * Sync the indirect blocks associated with the file 823 * for the same reasons as described above. Since this 824 * call wants the i_contents lock held for it we can do 825 * this here with no extra work. 826 */ 827 if (err = ufs_sync_indir(sip)) { 828 rw_exit(&sip->i_contents); 829 return (err); 830 } 831 832 if (op != DE_SYMLINK) 833 sip->i_nlink++; 834 TRANS_INODE(sip->i_ufsvfs, sip); 835 sip->i_flag |= ICHG; 836 sip->i_seq++; 837 ufs_iupdat(sip, I_SYNC); 838 rw_exit(&sip->i_contents); 839 840 /* 841 * If target directory has not been removed, then we can consider 842 * allowing file to be created. 843 */ 844 if (tdp->i_nlink <= 0) { 845 err = ENOENT; 846 goto out2; 847 } 848 /* 849 * Check accessibility of directory. 
850 */ 851 if (((tdp->i_mode & IFMT) != IFDIR) && 852 (tdp->i_mode & IFMT) != IFATTRDIR) { 853 err = ENOTDIR; 854 goto out2; 855 } 856 /* 857 * Execute access is required to search the directory. 858 */ 859 if (err = ufs_iaccess(tdp, IEXEC, cr)) { 860 goto out2; 861 } 862 863 /* 864 * Search for the entry. Return VN_HELD tip if found. 865 */ 866 tip = NULL; 867 slot.status = NONE; 868 slot.fbp = NULL; 869 rw_enter(&tdp->i_ufsvfs->vfs_dqrwlock, RW_READER); 870 rw_enter(&tdp->i_contents, RW_WRITER); 871 err = ufs_dircheckforname(tdp, namep, namlen, &slot, &tip, cr, 0); 872 if (err) 873 goto out; 874 875 if (tip) { 876 switch (op) { 877 case DE_RENAME: 878 err = ufs_dirrename(sdp, sip, tdp, namep, 879 tip, &slot, cr); 880 break; 881 882 case DE_LINK: 883 case DE_SYMLINK: 884 /* 885 * Can't link to an existing file. 886 */ 887 err = EEXIST; 888 break; 889 default: 890 break; 891 } 892 } else { 893 /* 894 * The entry does not exist. Check write permission in 895 * directory to see if entry can be created. 896 */ 897 if (err = ufs_iaccess(tdp, IWRITE, cr)) 898 goto out; 899 err = ufs_diraddentry(tdp, namep, op, namlen, &slot, sip, sdp, 900 cr); 901 } 902 903 out: 904 if (slot.fbp) 905 fbrelse(slot.fbp, S_OTHER); 906 907 rw_exit(&tdp->i_contents); 908 909 /* 910 * Drop vfs_dqrwlock before calling VN_RELE() on tip to 911 * avoid deadlock since ufs_delete() grabs vfs_dqrwlock as reader. 912 */ 913 rw_exit(&tdp->i_ufsvfs->vfs_dqrwlock); 914 915 /* 916 * If we renamed a file over the top of an existing file, 917 * or linked a file to an existing file (or tried to), 918 * then set *tvpp to the target vnode, if tvpp is non-null 919 * otherwise, release and delete (or just release) the inode. 920 * 921 * N.B., by returning the target's vnode pointer to the caller, 922 * that caller becomes responsible for doing the VN_RELE. 
923 */ 924 if (tip) { 925 if ((err == 0) && (tvpp != NULL)) { 926 *tvpp = ITOV(tip); 927 } else { 928 VN_RELE(ITOV(tip)); 929 } 930 } 931 932 out2: 933 if (err) { 934 /* 935 * Undo bumped link count. 936 */ 937 if (op != DE_SYMLINK) { 938 rw_enter(&sip->i_contents, RW_WRITER); 939 sip->i_nlink--; 940 ufs_setreclaim(sip); 941 TRANS_INODE(sip->i_ufsvfs, sip); 942 sip->i_flag |= ICHG; 943 sip->i_seq++; 944 ITIMES_NOLOCK(sip); 945 rw_exit(&sip->i_contents); 946 } 947 } 948 return (err); 949 } 950 951 /* 952 * Check for the existence of a name in a directory (unless noentry 953 * is set) , or else of an empty 954 * slot in which an entry may be made. If the requested name is found, 955 * then on return *ipp points at the inode and *offp contains 956 * its offset in the directory. If the name is not found, then *ipp 957 * will be NULL and *slotp will contain information about a directory slot in 958 * which an entry may be made (either an empty slot, or the first position 959 * past the end of the directory). 960 * The target directory inode (tdp) is supplied write locked (i_rwlock). 961 * 962 * This may not be used on "." or "..", but aliases of "." are ok. 
963 */ 964 int 965 ufs_dircheckforname( 966 struct inode *tdp, /* inode of directory being checked */ 967 char *namep, /* name we're checking for */ 968 int namlen, /* length of name, excluding null */ 969 struct slot *slotp, /* slot structure */ 970 struct inode **ipp, /* return inode if we find one */ 971 struct cred *cr, 972 int noentry) /* noentry - just look for space */ 973 { 974 uint64_t handle; 975 struct fbuf *fbp; /* pointer to directory block */ 976 struct direct *ep; /* directory entry */ 977 struct direct *nep; /* next directory entry */ 978 dcanchor_t *dcap; 979 vnode_t *dvp; /* directory vnode ptr */ 980 off_t dirsize; /* size of the directory */ 981 off_t offset; /* offset in the directory */ 982 off_t last_offset; /* last offset */ 983 off_t enduseful; /* pointer past last used dir slot */ 984 int entryoffsetinblk; /* offset of ep in fbp's buffer */ 985 int i; /* length of mangled entry */ 986 int needed; 987 int err; 988 int first; 989 int caching; 990 int stat; 991 ino_t ep_ino; 992 slotstat_t initstat = slotp->status; 993 994 ASSERT(RW_WRITE_HELD(&tdp->i_rwlock)); 995 ASSERT(RW_WRITE_HELD(&tdp->i_contents)); 996 ASSERT(*ipp == NULL); 997 fbp = NULL; 998 999 /* 1000 * First check if there is a complete cache of the directory. 1001 */ 1002 dvp = ITOV(tdp); 1003 1004 dcap = &tdp->i_danchor; 1005 if (noentry) { 1006 /* 1007 * We know from the 1st level dnlc cache that the entry 1008 * doesn't exist, so don't bother searching the directory 1009 * cache, but just look for space (possibly in the directory 1010 * cache). 1011 */ 1012 stat = DNOENT; 1013 } else { 1014 stat = dnlc_dir_lookup(dcap, namep, &handle); 1015 } 1016 switch (stat) { 1017 case DFOUND: 1018 ep_ino = (ino_t)H_TO_INO(handle); 1019 if (tdp->i_number == ep_ino) { 1020 *ipp = tdp; /* we want ourself, ie "." 
*/ 1021 VN_HOLD(dvp); 1022 } else { 1023 err = ufs_iget_alloced(tdp->i_vfs, ep_ino, ipp, cr); 1024 if (err) 1025 return (err); 1026 } 1027 offset = H_TO_OFF(handle); 1028 first = 0; 1029 if (offset & 1) { 1030 /* This is the first entry in the block */ 1031 first = 1; 1032 offset -= 1; 1033 ASSERT((offset & (DIRBLKSIZ - 1)) == 0); 1034 } 1035 err = blkatoff(tdp, offset, (char **)&ep, &fbp); 1036 if (err) { 1037 VN_RELE(ITOV(*ipp)); 1038 *ipp = NULL; 1039 return (err); 1040 } 1041 /* 1042 * Check the validity of the entry. 1043 * If it's bad, then throw away the cache and 1044 * continue without it. The dirmangled() routine 1045 * will then be called upon it. 1046 */ 1047 if ((ep->d_reclen == 0) || (ep->d_reclen & 0x3)) { 1048 VN_RELE(ITOV(*ipp)); 1049 *ipp = NULL; 1050 dnlc_dir_purge(dcap); 1051 break; 1052 } 1053 /* 1054 * Remember the returned offset is the offset of the 1055 * preceding record (unless this is the 1st record 1056 * in the DIRBLKSIZ sized block (disk sector)), then it's 1057 * offset + 1. Note, no real offsets are on odd boundaries. 1058 */ 1059 if (first) { 1060 ASSERT((offset & (DIRBLKSIZ - 1)) == 0); 1061 slotp->offset = offset; 1062 slotp->size = 0; 1063 slotp->ep = ep; 1064 } else { 1065 /* get the next entry */ 1066 nep = (struct direct *)((char *)ep + ep->d_reclen); 1067 /* 1068 * Check the validity of this entry as well 1069 * If it's bad, then throw away the cache and 1070 * continue without it. The dirmangled() routine 1071 * will then be called upon it. 
1072 */ 1073 if ((nep->d_reclen == 0) || (nep->d_reclen & 0x3) || 1074 (nep->d_ino != ep_ino)) { 1075 VN_RELE(ITOV(*ipp)); 1076 *ipp = NULL; 1077 dnlc_dir_purge(dcap); 1078 break; 1079 } 1080 slotp->offset = offset + ep->d_reclen; 1081 slotp->size = ep->d_reclen; 1082 slotp->ep = nep; 1083 } 1084 slotp->status = EXIST; 1085 slotp->fbp = fbp; 1086 slotp->endoff = 0; 1087 slotp->cached = 1; 1088 dnlc_update(dvp, namep, ITOV(*ipp)); 1089 return (0); 1090 case DNOENT: 1091 /* 1092 * The caller gets to set the initial slot status to 1093 * indicate whether it's interested in getting a 1094 * empty slot. For example, the status can be set 1095 * to FOUND when an entry is being deleted. 1096 */ 1097 ASSERT(slotp->fbp == NULL); 1098 if (slotp->status == FOUND) { 1099 return (0); 1100 } 1101 switch (dnlc_dir_rem_space_by_len(dcap, LDIRSIZ(namlen), 1102 &handle)) { 1103 case DFOUND: 1104 offset = (off_t)handle; 1105 err = blkatoff(tdp, offset, (char **)&ep, &fbp); 1106 if (err) { 1107 dnlc_dir_purge(dcap); 1108 ASSERT(*ipp == NULL); 1109 return (err); 1110 } 1111 /* 1112 * Check the validity of the entry. 1113 * If it's bad, then throw away the cache and 1114 * continue without it. The dirmangled() routine 1115 * will then be called upon it. 1116 */ 1117 if ((ep->d_reclen == 0) || (ep->d_reclen & 0x3)) { 1118 dnlc_dir_purge(dcap); 1119 break; 1120 } 1121 /* 1122 * Remember the returned offset is the offset of the 1123 * containing record. 1124 */ 1125 slotp->status = FOUND; 1126 slotp->ep = ep; 1127 slotp->offset = offset; 1128 slotp->fbp = fbp; 1129 slotp->size = ep->d_reclen; 1130 /* 1131 * Set end offset to 0. Truncation is handled 1132 * because the dnlc cache will blow away the 1133 * cached directory when an entry is removed 1134 * that drops the entries left to less than half 1135 * the minumum number (dnlc_min_dir_cache). 
1136 */ 1137 slotp->endoff = 0; 1138 slotp->cached = 1; 1139 return (0); 1140 case DNOENT: 1141 slotp->status = NONE; 1142 slotp->offset = P2ROUNDUP_TYPED(tdp->i_size, 1143 DIRBLKSIZ, u_offset_t); 1144 slotp->size = DIRBLKSIZ; 1145 slotp->endoff = 0; 1146 slotp->cached = 1; 1147 return (0); 1148 default: 1149 break; 1150 } 1151 break; 1152 } 1153 slotp->cached = 0; 1154 caching = NULL; 1155 if (tdp->i_cachedir && !noentry) { 1156 /* 1157 * Attempt to cache any directories greater than 1158 * the tunable ufs_min_cache_dir. 1159 */ 1160 if (tdp->i_size >= ufs_min_dir_cache) { 1161 switch (dnlc_dir_start(dcap, 1162 tdp->i_size >> AV_DIRECT_SHIFT)) { 1163 case DNOMEM: 1164 case DTOOBIG: 1165 tdp->i_cachedir = 0; 1166 break; 1167 case DOK: 1168 caching = 1; 1169 break; 1170 default: 1171 break; 1172 } 1173 } 1174 } 1175 1176 /* 1177 * No point in using i_diroff since we must search whole directory 1178 */ 1179 dirsize = P2ROUNDUP_TYPED(tdp->i_size, DIRBLKSIZ, u_offset_t); 1180 enduseful = 0; 1181 offset = last_offset = 0; 1182 entryoffsetinblk = 0; 1183 needed = (int)LDIRSIZ(namlen); 1184 while (offset < dirsize) { 1185 /* 1186 * If offset is on a block boundary, 1187 * read the next directory block. 1188 * Release previous if it exists. 1189 */ 1190 if (blkoff(tdp->i_fs, offset) == 0) { 1191 if (fbp != NULL) 1192 fbrelse(fbp, S_OTHER); 1193 1194 err = blkatoff(tdp, offset, (char **)0, &fbp); 1195 if (err) { 1196 ASSERT(*ipp == NULL); 1197 if (caching) { 1198 dnlc_dir_purge(dcap); 1199 } 1200 return (err); 1201 } 1202 entryoffsetinblk = 0; 1203 } 1204 /* 1205 * If still looking for a slot, and at a DIRBLKSIZ 1206 * boundary, have to start looking for free space 1207 * again. 1208 */ 1209 if (slotp->status == NONE && 1210 (entryoffsetinblk & (DIRBLKSIZ - 1)) == 0) { 1211 slotp->offset = -1; 1212 } 1213 /* 1214 * If the next entry is a zero length record or if the 1215 * record length is invalid, then skip to the next 1216 * directory block. 
Complete validation checks are 1217 * done if the record length is invalid. 1218 * 1219 * Full validation checks are slow so they are disabled 1220 * by default. Complete checks can be run by patching 1221 * "dirchk" to be true. 1222 * 1223 * We do not have to check the validity of 1224 * entryoffsetinblk here because it starts out as zero 1225 * and is only incremented by d_reclen values that we 1226 * validate here. 1227 */ 1228 ep = (struct direct *)(fbp->fb_addr + entryoffsetinblk); 1229 if (ep->d_reclen == 0 || 1230 (dirchk || (ep->d_reclen & 0x3)) && 1231 dirmangled(tdp, ep, entryoffsetinblk, offset)) { 1232 i = DIRBLKSIZ - (entryoffsetinblk & (DIRBLKSIZ - 1)); 1233 offset += i; 1234 entryoffsetinblk += i; 1235 if (caching) { 1236 dnlc_dir_purge(dcap); 1237 caching = 0; 1238 } 1239 continue; 1240 } 1241 1242 /* 1243 * Add named entries and free space into the directory cache 1244 */ 1245 if (caching) { 1246 ushort_t extra; 1247 off_t off2; 1248 1249 if (ep->d_ino == 0) { 1250 extra = ep->d_reclen; 1251 if (offset & (DIRBLKSIZ - 1)) { 1252 dnlc_dir_purge(dcap); 1253 caching = 0; 1254 } 1255 } else { 1256 /* 1257 * entries hold the previous offset if 1258 * not the 1st one 1259 */ 1260 if (offset & (DIRBLKSIZ - 1)) { 1261 off2 = last_offset; 1262 } else { 1263 off2 = offset + 1; 1264 } 1265 caching = (dnlc_dir_add_entry(dcap, ep->d_name, 1266 INO_OFF_TO_H(ep->d_ino, off2)) == DOK); 1267 extra = ep->d_reclen - DIRSIZ(ep); 1268 } 1269 if (caching && (extra >= LDIRSIZ(1))) { 1270 caching = (dnlc_dir_add_space(dcap, extra, 1271 (uint64_t)offset) == DOK); 1272 } 1273 } 1274 1275 /* 1276 * If an appropriate sized slot has not yet been found, 1277 * check to see if one is available. 
1278 */ 1279 if ((slotp->status != FOUND) && (slotp->status != EXIST)) { 1280 int size = ep->d_reclen; 1281 1282 if (ep->d_ino != 0) 1283 size -= DIRSIZ(ep); 1284 if (size > 0) { 1285 if (size >= needed) { 1286 slotp->offset = offset; 1287 slotp->size = ep->d_reclen; 1288 if (noentry) { 1289 slotp->ep = ep; 1290 slotp->fbp = fbp; 1291 slotp->status = FOUND; 1292 slotp->endoff = 0; 1293 return (0); 1294 } 1295 slotp->status = FOUND; 1296 } else if (slotp->status == NONE) { 1297 if (slotp->offset == -1) 1298 slotp->offset = offset; 1299 } 1300 } 1301 } 1302 /* 1303 * Check for a name match. 1304 */ 1305 if (ep->d_ino && ep->d_namlen == namlen && 1306 *namep == *ep->d_name && /* fast chk 1st char */ 1307 bcmp(namep, ep->d_name, namlen) == 0) { 1308 1309 tdp->i_diroff = offset; 1310 1311 if (tdp->i_number == ep->d_ino) { 1312 *ipp = tdp; /* we want ourself, ie "." */ 1313 VN_HOLD(dvp); 1314 } else { 1315 err = ufs_iget_alloced(tdp->i_vfs, 1316 (ino_t)ep->d_ino, ipp, cr); 1317 if (err) { 1318 fbrelse(fbp, S_OTHER); 1319 if (caching) 1320 dnlc_dir_purge(dcap); 1321 return (err); 1322 } 1323 } 1324 slotp->status = EXIST; 1325 slotp->offset = offset; 1326 slotp->size = (int)(offset - last_offset); 1327 slotp->fbp = fbp; 1328 slotp->ep = ep; 1329 slotp->endoff = 0; 1330 if (caching) 1331 dnlc_dir_purge(dcap); 1332 return (0); 1333 } 1334 last_offset = offset; 1335 offset += ep->d_reclen; 1336 entryoffsetinblk += ep->d_reclen; 1337 if (ep->d_ino) 1338 enduseful = offset; 1339 } 1340 if (fbp) { 1341 fbrelse(fbp, S_OTHER); 1342 } 1343 1344 if (caching) { 1345 dnlc_dir_complete(dcap); 1346 slotp->cached = 1; 1347 if (slotp->status == FOUND) { 1348 if (initstat == FOUND) { 1349 return (0); 1350 } 1351 (void) dnlc_dir_rem_space_by_handle(dcap, 1352 slotp->offset); 1353 slotp->endoff = 0; 1354 return (0); 1355 } 1356 } 1357 1358 if (slotp->status == NONE) { 1359 /* 1360 * We didn't find a slot; the new directory entry should be put 1361 * at the end of the directory. 
Return an indication of where 1362 * this is, and set "endoff" to zero; since we're going to have 1363 * to extend the directory, we're certainly not going to 1364 * truncate it. 1365 */ 1366 slotp->offset = dirsize; 1367 slotp->size = DIRBLKSIZ; 1368 slotp->endoff = 0; 1369 } else { 1370 /* 1371 * We found a slot, and will return an indication of where that 1372 * slot is, as any new directory entry will be put there. 1373 * Since that slot will become a useful entry, if the last 1374 * useful entry we found was before this one, update the offset 1375 * of the last useful entry. 1376 */ 1377 if (enduseful < slotp->offset + slotp->size) 1378 enduseful = slotp->offset + slotp->size; 1379 slotp->endoff = P2ROUNDUP_TYPED(enduseful, DIRBLKSIZ, off_t); 1380 } 1381 *ipp = NULL; 1382 return (0); 1383 } 1384 1385 uint64_t ufs_dirrename_retry_cnt; 1386 1387 /* 1388 * Rename the entry in the directory tdp so that it points to 1389 * sip instead of tip. 1390 */ 1391 static int 1392 ufs_dirrename( 1393 struct inode *sdp, /* parent directory of source */ 1394 struct inode *sip, /* source inode */ 1395 struct inode *tdp, /* parent directory of target */ 1396 char *namep, /* entry we are trying to change */ 1397 struct inode *tip, /* target inode */ 1398 struct slot *slotp, /* slot for entry */ 1399 struct cred *cr) /* credentials */ 1400 { 1401 vnode_t *tdvp; 1402 off_t offset; 1403 int err; 1404 int doingdirectory; 1405 1406 ASSERT(sdp->i_ufsvfs != NULL); 1407 ASSERT(RW_WRITE_HELD(&tdp->i_rwlock)); 1408 ASSERT(RW_WRITE_HELD(&tdp->i_contents)); 1409 /* 1410 * Short circuit rename of something to itself. 1411 */ 1412 if (sip->i_number == tip->i_number) { 1413 return (ESAME); /* special KLUDGE error code */ 1414 } 1415 1416 /* 1417 * We're locking 2 peer level locks, so must use tryenter 1418 * on the 2nd to avoid deadlocks that would occur 1419 * if we renamed a->b and b->a concurrently. 
1420 */ 1421 retry: 1422 rw_enter(&tip->i_contents, RW_WRITER); 1423 if (!rw_tryenter(&sip->i_contents, RW_READER)) { 1424 /* 1425 * drop tip and wait (sleep) until we stand a chance 1426 * of holding sip 1427 */ 1428 rw_exit(&tip->i_contents); 1429 rw_enter(&sip->i_contents, RW_READER); 1430 /* 1431 * Reverse the lock grabs in case we have heavy 1432 * contention on the 2nd lock. 1433 */ 1434 if (!rw_tryenter(&tip->i_contents, RW_WRITER)) { 1435 ufs_dirrename_retry_cnt++; 1436 rw_exit(&sip->i_contents); 1437 goto retry; 1438 } 1439 } 1440 1441 /* 1442 * Check that everything is on the same filesystem. 1443 */ 1444 if ((ITOV(tip)->v_vfsp != ITOV(tdp)->v_vfsp) || 1445 (ITOV(tip)->v_vfsp != ITOV(sip)->v_vfsp)) { 1446 err = EXDEV; /* XXX archaic */ 1447 goto out; 1448 } 1449 /* 1450 * Must have write permission to rewrite target entry. 1451 * Perform additional checks for sticky directories. 1452 */ 1453 if ((err = ufs_iaccess(tdp, IWRITE, cr)) != 0 || 1454 (err = ufs_sticky_remove_access(tdp, tip, cr)) != 0) 1455 goto out; 1456 1457 /* 1458 * Ensure source and target are compatible (both directories 1459 * or both not directories). If target is a directory it must 1460 * be empty and have no links to it; in addition it must not 1461 * be a mount point, and both the source and target must be 1462 * writable. 1463 */ 1464 doingdirectory = (((sip->i_mode & IFMT) == IFDIR) || 1465 ((sip->i_mode & IFMT) == IFATTRDIR)); 1466 if (((tip->i_mode & IFMT) == IFDIR) || 1467 ((tip->i_mode & IFMT) == IFATTRDIR)) { 1468 if (!doingdirectory) { 1469 err = EISDIR; 1470 goto out; 1471 } 1472 /* 1473 * vn_vfswlock will prevent mounts from using the directory 1474 * until we are done. 
1475 */ 1476 if (vn_vfswlock(ITOV(tip))) { 1477 err = EBUSY; 1478 goto out; 1479 } 1480 if (vn_mountedvfs(ITOV(tip)) != NULL) { 1481 vn_vfsunlock(ITOV(tip)); 1482 err = EBUSY; 1483 goto out; 1484 } 1485 if (!ufs_dirempty(tip, tdp->i_number, cr) || tip->i_nlink > 2) { 1486 vn_vfsunlock(ITOV(tip)); 1487 err = EEXIST; /* SIGH should be ENOTEMPTY */ 1488 goto out; 1489 } 1490 } else if (doingdirectory) { 1491 err = ENOTDIR; 1492 goto out; 1493 } 1494 1495 /* 1496 * Rewrite the inode pointer for target name entry 1497 * from the target inode (ip) to the source inode (sip). 1498 * This prevents the target entry from disappearing 1499 * during a crash. Mark the directory inode to reflect the changes. 1500 */ 1501 tdvp = ITOV(tdp); 1502 slotp->ep->d_ino = (int32_t)sip->i_number; 1503 dnlc_update(tdvp, namep, ITOV(sip)); 1504 if (slotp->size) { 1505 offset = slotp->offset - slotp->size; 1506 } else { 1507 offset = slotp->offset + 1; 1508 } 1509 if (slotp->cached) { 1510 (void) dnlc_dir_update(&tdp->i_danchor, namep, 1511 INO_OFF_TO_H(slotp->ep->d_ino, offset)); 1512 } 1513 1514 err = TRANS_DIR(tdp, slotp->offset); 1515 if (err) 1516 fbrelse(slotp->fbp, S_OTHER); 1517 else 1518 err = ufs_fbwrite(slotp->fbp, tdp); 1519 1520 slotp->fbp = NULL; 1521 if (err) { 1522 if (doingdirectory) 1523 vn_vfsunlock(ITOV(tip)); 1524 goto out; 1525 } 1526 1527 TRANS_INODE(tdp->i_ufsvfs, tdp); 1528 tdp->i_flag |= IUPD|ICHG; 1529 tdp->i_seq++; 1530 ITIMES_NOLOCK(tdp); 1531 1532 /* 1533 * Decrement the link count of the target inode. 1534 * Fix the ".." entry in sip to point to dp. 1535 * This is done after the new entry is on the disk. 1536 */ 1537 tip->i_nlink--; 1538 TRANS_INODE(tip->i_ufsvfs, tip); 1539 tip->i_flag |= ICHG; 1540 tip->i_seq++; 1541 ITIMES_NOLOCK(tip); 1542 if (doingdirectory) { 1543 /* 1544 * The entry for tip no longer exists so I can unlock the 1545 * vfslock. 1546 */ 1547 vn_vfsunlock(ITOV(tip)); 1548 /* 1549 * Decrement target link count once more if it was a directory. 
1550 */ 1551 if (--tip->i_nlink != 0) { 1552 err = ufs_fault(ITOV(tip), 1553 "ufs_dirrename: target directory link count != 0 (%s)", 1554 tip->i_fs->fs_fsmnt); 1555 rw_exit(&tip->i_contents); 1556 return (err); 1557 } 1558 TRANS_INODE(tip->i_ufsvfs, tip); 1559 ufs_setreclaim(tip); 1560 /* 1561 * Renaming a directory with the parent different 1562 * requires that ".." be rewritten. The window is 1563 * still there for ".." to be inconsistent, but this 1564 * is unavoidable, and a lot shorter than when it was 1565 * done in a user process. We decrement the link 1566 * count in the new parent as appropriate to reflect 1567 * the just-removed target. If the parent is the 1568 * same, this is appropriate since the original 1569 * directory is going away. If the new parent is 1570 * different, ufs_dirfixdotdot() will bump the link count 1571 * back. 1572 */ 1573 tdp->i_nlink--; 1574 ufs_setreclaim(tdp); 1575 TRANS_INODE(tdp->i_ufsvfs, tdp); 1576 tdp->i_flag |= ICHG; 1577 tdp->i_seq++; 1578 ITIMES_NOLOCK(tdp); 1579 if (sdp != tdp) { 1580 rw_exit(&tip->i_contents); 1581 rw_exit(&sip->i_contents); 1582 err = ufs_dirfixdotdot(sip, sdp, tdp); 1583 return (err); 1584 } 1585 } else 1586 ufs_setreclaim(tip); 1587 out: 1588 rw_exit(&tip->i_contents); 1589 rw_exit(&sip->i_contents); 1590 return (err); 1591 } 1592 1593 /* 1594 * Fix the ".." entry of the child directory so that it points 1595 * to the new parent directory instead of the old one. Routine 1596 * assumes that dp is a directory and that all the inodes are on 1597 * the same file system. 
1598 */ 1599 static int 1600 ufs_dirfixdotdot( 1601 struct inode *dp, /* child directory */ 1602 struct inode *opdp, /* old parent directory */ 1603 struct inode *npdp) /* new parent directory */ 1604 { 1605 struct fbuf *fbp; 1606 struct dirtemplate *dirp; 1607 vnode_t *dvp; 1608 int err; 1609 1610 ASSERT(RW_WRITE_HELD(&npdp->i_rwlock)); 1611 ASSERT(RW_WRITE_HELD(&npdp->i_contents)); 1612 1613 /* 1614 * We hold the child directory's i_contents lock before calling 1615 * blkatoff so that we honor correct locking protocol which is 1616 * i_contents lock and then page lock. (blkatoff will call 1617 * ufs_getpage where we want the page lock) 1618 * We hold the child directory's i_rwlock before i_contents (as 1619 * per the locking protocol) since we are modifying the ".." entry 1620 * of the child directory. 1621 * We hold the i_rwlock and i_contents lock until we record 1622 * this directory delta to the log (via ufs_trans_dir) and have 1623 * done fbrelse. 1624 */ 1625 rw_enter(&dp->i_rwlock, RW_WRITER); 1626 rw_enter(&dp->i_contents, RW_WRITER); 1627 err = blkatoff(dp, (off_t)0, (char **)&dirp, &fbp); 1628 if (err) 1629 goto bad; 1630 1631 if (dp->i_nlink <= 0 || 1632 dp->i_size < sizeof (struct dirtemplate)) { 1633 err = ENOENT; 1634 goto bad; 1635 } 1636 1637 if (dirp->dotdot_namlen != 2 || 1638 dirp->dotdot_name[0] != '.' || 1639 dirp->dotdot_name[1] != '.') { /* Sanity check. */ 1640 dirbad(dp, "mangled .. entry", (off_t)0); 1641 err = ENOTDIR; 1642 goto bad; 1643 } 1644 1645 /* 1646 * Increment the link count in the new parent inode and force it out. 1647 */ 1648 if (npdp->i_nlink == MAXLINK) { 1649 err = EMLINK; 1650 goto bad; 1651 } 1652 npdp->i_nlink++; 1653 TRANS_INODE(npdp->i_ufsvfs, npdp); 1654 npdp->i_flag |= ICHG; 1655 npdp->i_seq++; 1656 ufs_iupdat(npdp, I_SYNC); 1657 1658 /* 1659 * Rewrite the child ".." entry and force it out. 
1660 */ 1661 dvp = ITOV(dp); 1662 dirp->dotdot_ino = (uint32_t)npdp->i_number; 1663 dnlc_update(dvp, "..", ITOV(npdp)); 1664 (void) dnlc_dir_update(&dp->i_danchor, "..", 1665 INO_OFF_TO_H(dirp->dotdot_ino, 0)); 1666 1667 err = TRANS_DIR(dp, 0); 1668 if (err) 1669 fbrelse(fbp, S_OTHER); 1670 else 1671 err = ufs_fbwrite(fbp, dp); 1672 1673 fbp = NULL; 1674 if (err) 1675 goto bad; 1676 1677 rw_exit(&dp->i_contents); 1678 rw_exit(&dp->i_rwlock); 1679 1680 /* 1681 * Decrement the link count of the old parent inode and force it out. 1682 */ 1683 ASSERT(opdp); 1684 rw_enter(&opdp->i_contents, RW_WRITER); 1685 ASSERT(opdp->i_nlink > 0); 1686 opdp->i_nlink--; 1687 ufs_setreclaim(opdp); 1688 TRANS_INODE(opdp->i_ufsvfs, opdp); 1689 opdp->i_flag |= ICHG; 1690 opdp->i_seq++; 1691 ufs_iupdat(opdp, I_SYNC); 1692 rw_exit(&opdp->i_contents); 1693 return (0); 1694 1695 bad: 1696 if (fbp) 1697 fbrelse(fbp, S_OTHER); 1698 rw_exit(&dp->i_contents); 1699 rw_exit(&dp->i_rwlock); 1700 return (err); 1701 } 1702 1703 /* 1704 * Enter the file sip in the directory tdp with name namep. 1705 */ 1706 static int 1707 ufs_diraddentry( 1708 struct inode *tdp, 1709 char *namep, 1710 enum de_op op, 1711 int namlen, 1712 struct slot *slotp, 1713 struct inode *sip, 1714 struct inode *sdp, 1715 struct cred *cr) 1716 { 1717 struct direct *ep, *nep; 1718 vnode_t *tdvp; 1719 dcanchor_t *dcap = &tdp->i_danchor; 1720 off_t offset; 1721 int err; 1722 ushort_t extra; 1723 1724 ASSERT(RW_WRITE_HELD(&tdp->i_rwlock)); 1725 ASSERT(RW_WRITE_HELD(&tdp->i_contents)); 1726 /* 1727 * Prepare a new entry. If the caller has not supplied an 1728 * existing inode, make a new one. 1729 */ 1730 err = dirprepareentry(tdp, slotp, cr); 1731 if (err) { 1732 if (slotp->fbp) { 1733 fbrelse(slotp->fbp, S_OTHER); 1734 slotp->fbp = NULL; 1735 } 1736 return (err); 1737 } 1738 /* 1739 * Check inode to be linked to see if it is in the 1740 * same filesystem. 
1741 */ 1742 if (ITOV(tdp)->v_vfsp != ITOV(sip)->v_vfsp) { 1743 err = EXDEV; 1744 goto bad; 1745 } 1746 1747 /* 1748 * If renaming a directory then fix up the ".." entry in the 1749 * directory to point to the new parent. 1750 */ 1751 if ((op == DE_RENAME) && (((sip->i_mode & IFMT) == IFDIR) || 1752 ((sip->i_mode & IFMT) == IFATTRDIR)) && (sdp != tdp)) { 1753 err = ufs_dirfixdotdot(sip, sdp, tdp); 1754 if (err) 1755 goto bad; 1756 } 1757 1758 /* 1759 * Fill in entry data. 1760 */ 1761 ep = slotp->ep; 1762 ep->d_namlen = (ushort_t)namlen; 1763 (void) strncpy(ep->d_name, namep, (size_t)((namlen + 4) & ~3)); 1764 ep->d_ino = (uint32_t)sip->i_number; 1765 tdvp = ITOV(tdp); 1766 dnlc_update(tdvp, namep, ITOV(sip)); 1767 /* 1768 * Note the offset supplied for any named entry is 1769 * the offset of the previous one, unless it's the 1st. 1770 * slotp->size is used to pass the length to 1771 * the previous entry. 1772 */ 1773 if (slotp->size) { 1774 offset = slotp->offset - slotp->size; 1775 } else { 1776 offset = slotp->offset + 1; 1777 } 1778 1779 if (slotp->cached) { 1780 /* 1781 * Add back any usable unused space to the dnlc directory 1782 * cache. 1783 */ 1784 extra = ep->d_reclen - DIRSIZ(ep); 1785 if (extra >= LDIRSIZ(1)) { 1786 (void) dnlc_dir_add_space(dcap, extra, 1787 (uint64_t)slotp->offset); 1788 } 1789 1790 (void) dnlc_dir_add_entry(dcap, namep, 1791 INO_OFF_TO_H(ep->d_ino, offset)); 1792 1793 /* adjust the previous offset of the next entry */ 1794 nep = (struct direct *)((char *)ep + ep->d_reclen); 1795 if ((uintptr_t)nep & (DIRBLKSIZ - 1)) { 1796 /* 1797 * Not a new block. 1798 * 1799 * Check the validity of the next entry. 1800 * If it's bad, then throw away the cache, and 1801 * continue as before directory caching. 
1802 */ 1803 if ((nep->d_reclen == 0) || (nep->d_reclen & 0x3) || 1804 dnlc_dir_update(dcap, nep->d_name, 1805 INO_OFF_TO_H(nep->d_ino, slotp->offset)) 1806 == DNOENT) { 1807 dnlc_dir_purge(dcap); 1808 slotp->cached = 0; 1809 } 1810 } 1811 } 1812 1813 /* 1814 * Write out the directory block. 1815 */ 1816 err = TRANS_DIR(tdp, slotp->offset); 1817 if (err) 1818 fbrelse(slotp->fbp, S_OTHER); 1819 else 1820 err = ufs_fbwrite(slotp->fbp, tdp); 1821 1822 slotp->fbp = NULL; 1823 /* 1824 * If this is a rename of a directory, then we have already 1825 * fixed the ".." entry to refer to the new parent. If err 1826 * is true at this point, we have failed to update the new 1827 * parent to refer to the renamed directory. 1828 * XXX - we need to unwind the ".." fix. 1829 */ 1830 if (err) 1831 return (err); 1832 1833 /* 1834 * Mark the directory inode to reflect the changes. 1835 * Truncate the directory to chop off blocks of empty entries. 1836 */ 1837 1838 TRANS_INODE(tdp->i_ufsvfs, tdp); 1839 tdp->i_flag |= IUPD|ICHG; 1840 tdp->i_seq++; 1841 tdp->i_diroff = 0; 1842 ITIMES_NOLOCK(tdp); 1843 /* 1844 * If the directory grew then dirprepareentry() will have 1845 * set IATTCHG in tdp->i_flag, then the directory inode must 1846 * be flushed out. This is because if fsync() is used later 1847 * the directory size must be correct, otherwise a crash would 1848 * cause fsck to move the file to lost+found. Also because later 1849 * a file may be linked in more than one directory, then there 1850 * is no way to flush the original directory. So it must be 1851 * flushed out on creation. See bug 4293809. 
1852 */ 1853 if (tdp->i_flag & IATTCHG) { 1854 ufs_iupdat(tdp, I_SYNC); 1855 } 1856 1857 if (slotp->endoff && (slotp->endoff < tdp->i_size)) { 1858 if (!TRANS_ISTRANS(tdp->i_ufsvfs)) { 1859 (void) ufs_itrunc(tdp, (u_offset_t)slotp->endoff, 0, 1860 cr); 1861 } 1862 } 1863 1864 1865 return (0); 1866 1867 bad: 1868 if (slotp->cached) { 1869 dnlc_dir_purge(dcap); 1870 fbrelse(slotp->fbp, S_OTHER); 1871 slotp->cached = 0; 1872 slotp->fbp = NULL; 1873 return (err); 1874 } 1875 1876 /* 1877 * Clear out entry prepared by dirprepareent. 1878 */ 1879 slotp->ep->d_ino = 0; 1880 slotp->ep->d_namlen = 0; 1881 1882 /* 1883 * Don't touch err so we don't clobber the real error that got us here. 1884 */ 1885 if (TRANS_DIR(tdp, slotp->offset)) 1886 fbrelse(slotp->fbp, S_OTHER); 1887 else 1888 (void) ufs_fbwrite(slotp->fbp, tdp); 1889 slotp->fbp = NULL; 1890 return (err); 1891 } 1892 1893 /* 1894 * Prepare a directory slot to receive an entry. 1895 */ 1896 static int 1897 dirprepareentry( 1898 struct inode *dp, /* directory we are working in */ 1899 struct slot *slotp, /* available slot info */ 1900 struct cred *cr) 1901 { 1902 struct direct *ep, *nep; 1903 off_t entryend; 1904 int err; 1905 slotstat_t status = slotp->status; 1906 ushort_t dsize; 1907 1908 ASSERT((status == NONE) || (status == FOUND)); 1909 ASSERT(RW_WRITE_HELD(&dp->i_rwlock)); 1910 ASSERT(RW_WRITE_HELD(&dp->i_contents)); 1911 /* 1912 * If we didn't find a slot, then indicate that the 1913 * new slot belongs at the end of the directory. 1914 * If we found a slot, then the new entry can be 1915 * put at slotp->offset. 
1916 */ 1917 entryend = slotp->offset + slotp->size; 1918 if (status == NONE) { 1919 ASSERT((slotp->offset & (DIRBLKSIZ - 1)) == 0); 1920 if (DIRBLKSIZ > dp->i_fs->fs_fsize) { 1921 err = ufs_fault(ITOV(dp), 1922 "dirprepareentry: bad fs_fsize, DIRBLKSIZ: %d" 1923 " > dp->i_fs->fs_fsize: %d (%s)", 1924 DIRBLKSIZ, dp->i_fs->fs_fsize, dp->i_fs->fs_fsmnt); 1925 return (err); 1926 } 1927 /* 1928 * Allocate the new block. 1929 */ 1930 err = BMAPALLOC(dp, (u_offset_t)slotp->offset, 1931 (int)(blkoff(dp->i_fs, slotp->offset) + DIRBLKSIZ), cr); 1932 if (err) { 1933 return (err); 1934 } 1935 dp->i_size = entryend; 1936 TRANS_INODE(dp->i_ufsvfs, dp); 1937 dp->i_flag |= IUPD|ICHG|IATTCHG; 1938 dp->i_seq++; 1939 ITIMES_NOLOCK(dp); 1940 } else if (entryend > dp->i_size) { 1941 /* 1942 * Adjust directory size, if needed. This should never 1943 * push the size past a new multiple of DIRBLKSIZ. 1944 * This is an artifact of the old (4.2BSD) way of initializing 1945 * directory sizes to be less than DIRBLKSIZ. 1946 */ 1947 dp->i_size = P2ROUNDUP_TYPED(entryend, DIRBLKSIZ, off_t); 1948 TRANS_INODE(dp->i_ufsvfs, dp); 1949 dp->i_flag |= IUPD|ICHG|IATTCHG; 1950 dp->i_seq++; 1951 ITIMES_NOLOCK(dp); 1952 } 1953 1954 /* 1955 * Get the block containing the space for the new directory entry. 1956 */ 1957 if (slotp->fbp == NULL) { 1958 err = blkatoff(dp, slotp->offset, (char **)&slotp->ep, 1959 &slotp->fbp); 1960 if (err) { 1961 return (err); 1962 } 1963 } 1964 ep = slotp->ep; 1965 1966 switch (status) { 1967 case NONE: 1968 /* 1969 * No space in the directory. slotp->offset will be on a 1970 * directory block boundary and we will write the new entry 1971 * into a fresh block. 1972 */ 1973 ep->d_reclen = DIRBLKSIZ; 1974 slotp->size = 0; /* length of previous entry */ 1975 break; 1976 case FOUND: 1977 /* 1978 * An entry of the required size has been found. Use it. 
1979 */ 1980 if (ep->d_ino == 0) { 1981 /* this is the 1st record in a block */ 1982 slotp->size = 0; /* length of previous entry */ 1983 } else { 1984 dsize = DIRSIZ(ep); 1985 nep = (struct direct *)((char *)ep + dsize); 1986 nep->d_reclen = ep->d_reclen - dsize; 1987 ep->d_reclen = dsize; 1988 slotp->ep = nep; 1989 slotp->offset += dsize; 1990 slotp->size = dsize; /* length of previous entry */ 1991 } 1992 break; 1993 default: 1994 break; 1995 } 1996 return (0); 1997 } 1998 1999 /* 2000 * Allocate and initialize a new inode that will go into directory tdp. 2001 * This routine is called from ufs_symlink(), as well as within this file. 2002 */ 2003 int 2004 ufs_dirmakeinode( 2005 struct inode *tdp, 2006 struct inode **ipp, 2007 struct vattr *vap, 2008 enum de_op op, 2009 struct cred *cr) 2010 { 2011 struct inode *ip; 2012 enum vtype type; 2013 int imode; /* mode and format as in inode */ 2014 ino_t ipref; 2015 int err; 2016 timestruc_t now; 2017 2018 ASSERT(vap != NULL); 2019 ASSERT(op == DE_CREATE || op == DE_MKDIR || op == DE_ATTRDIR || 2020 op == DE_SYMLINK); 2021 ASSERT((vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE)); 2022 ASSERT(RW_WRITE_HELD(&tdp->i_rwlock)); 2023 ASSERT(RW_WRITE_HELD(&tdp->i_contents)); 2024 /* 2025 * Allocate a new inode. 2026 */ 2027 type = vap->va_type; 2028 if (type == VDIR) { 2029 ipref = dirpref(tdp); 2030 } else { 2031 ipref = tdp->i_number; 2032 } 2033 if (op == DE_ATTRDIR) 2034 imode = vap->va_mode; 2035 else 2036 imode = MAKEIMODE(type, vap->va_mode); 2037 *ipp = NULL; 2038 err = ufs_ialloc(tdp, ipref, imode, &ip, cr); 2039 if (err) 2040 return (err); 2041 2042 /* 2043 * We don't need to grab vfs_dqrwlock here because it is held 2044 * in ufs_direnter_*() above us. 
2045 */ 2046 ASSERT(RW_READ_HELD(&ip->i_ufsvfs->vfs_dqrwlock)); 2047 rw_enter(&ip->i_contents, RW_WRITER); 2048 if (ip->i_dquot != NULL) { 2049 err = ufs_fault(ITOV(ip), 2050 "ufs_dirmakeinode, ip->i_dquot != NULL: dquot (%s)", 2051 tdp->i_fs->fs_fsmnt); 2052 rw_exit(&ip->i_contents); 2053 return (err); 2054 } 2055 *ipp = ip; 2056 ip->i_mode = (o_mode_t)imode; 2057 if (type == VBLK || type == VCHR) { 2058 dev_t d = vap->va_rdev; 2059 dev32_t dev32; 2060 2061 /* 2062 * Don't allow a special file to be created with a 2063 * dev_t that cannot be represented by this filesystem 2064 * format on disk. 2065 */ 2066 if (!cmpldev(&dev32, d)) { 2067 err = EOVERFLOW; 2068 goto fail; 2069 } 2070 2071 ITOV(ip)->v_rdev = ip->i_rdev = d; 2072 2073 if (dev32 & ~((O_MAXMAJ << L_BITSMINOR32) | O_MAXMIN)) { 2074 ip->i_ordev = dev32; /* can't use old format */ 2075 } else { 2076 ip->i_ordev = cmpdev(d); 2077 } 2078 } 2079 ITOV(ip)->v_type = type; 2080 ufs_reset_vnode(ip->i_vnode); 2081 if (type == VDIR) { 2082 ip->i_nlink = 2; /* anticipating a call to dirmakedirect */ 2083 } else { 2084 ip->i_nlink = 1; 2085 } 2086 2087 if (op == DE_ATTRDIR) { 2088 ip->i_uid = vap->va_uid; 2089 ip->i_gid = vap->va_gid; 2090 } else 2091 ip->i_uid = crgetuid(cr); 2092 /* 2093 * To determine the group-id of the created file: 2094 * 1) If the gid is set in the attribute list (non-Sun & pre-4.0 2095 * clients are not likely to set the gid), then use it if 2096 * the process is privileged, belongs to the target group, 2097 * or the group is the same as the parent directory. 2098 * 2) If the filesystem was not mounted with the Old-BSD-compatible 2099 * GRPID option, and the directory's set-gid bit is clear, 2100 * then use the process's gid. 2101 * 3) Otherwise, set the group-id to the gid of the parent directory. 
2102 */ 2103 if (op != DE_ATTRDIR && (vap->va_mask & AT_GID) && 2104 ((vap->va_gid == tdp->i_gid) || groupmember(vap->va_gid, cr) || 2105 secpolicy_vnode_create_gid(cr) == 0)) { 2106 /* 2107 * XXX - is this only the case when a 4.0 NFS client, or a 2108 * client derived from that code, makes a call over the wire? 2109 */ 2110 ip->i_gid = vap->va_gid; 2111 } else 2112 ip->i_gid = (tdp->i_mode & ISGID) ? tdp->i_gid : crgetgid(cr); 2113 2114 /* 2115 * For SunOS 5.0->5.4, the lines below read: 2116 * 2117 * ip->i_suid = (ip->i_uid > MAXUID) ? UID_LONG : ip->i_uid; 2118 * ip->i_sgid = (ip->i_gid > MAXUID) ? GID_LONG : ip->i_gid; 2119 * 2120 * where MAXUID was set to 60002. See notes on this in ufs_inode.c 2121 */ 2122 ip->i_suid = (ulong_t)ip->i_uid > (ulong_t)USHRT_MAX ? 2123 UID_LONG : ip->i_uid; 2124 ip->i_sgid = (ulong_t)ip->i_gid > (ulong_t)USHRT_MAX ? 2125 GID_LONG : ip->i_gid; 2126 2127 /* 2128 * If we're creating a directory, and the parent directory has the 2129 * set-GID bit set, set it on the new directory. 2130 * Otherwise, if the user is neither privileged nor a member of the 2131 * file's new group, clear the file's set-GID bit. 2132 */ 2133 if ((tdp->i_mode & ISGID) && (type == VDIR)) 2134 ip->i_mode |= ISGID; 2135 else { 2136 if ((ip->i_mode & ISGID) && 2137 secpolicy_vnode_setids_setgids(cr, ip->i_gid) != 0) 2138 ip->i_mode &= ~ISGID; 2139 } 2140 2141 if (((vap->va_mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) || 2142 ((vap->va_mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) { 2143 err = EOVERFLOW; 2144 goto fail; 2145 } 2146 2147 /* 2148 * Extended attribute directories are not subject to quotas. 2149 */ 2150 if (op != DE_ATTRDIR) 2151 ip->i_dquot = getinoquota(ip); 2152 else 2153 ip->i_dquot = NULL; 2154 2155 if (op == DE_MKDIR || op == DE_ATTRDIR) { 2156 err = ufs_dirmakedirect(ip, tdp, (op == DE_MKDIR) ? 
0 : 1, cr); 2157 if (err) 2158 goto fail; 2159 } 2160 2161 /* 2162 * generate the shadow inode and attach it to the new object 2163 */ 2164 ASSERT((tdp->i_shadow && tdp->i_ufs_acl) || 2165 (!tdp->i_shadow && !tdp->i_ufs_acl)); 2166 if (tdp->i_shadow && tdp->i_ufs_acl && 2167 (((tdp->i_mode & IFMT) == IFDIR) || 2168 ((tdp->i_mode & IFMT) == IFATTRDIR))) { 2169 err = ufs_si_inherit(ip, tdp, ip->i_mode, cr); 2170 if (err) { 2171 if (op == DE_MKDIR) { 2172 /* 2173 * clean up parent directory 2174 * 2175 * tdp->i_contents already locked from 2176 * ufs_direnter_*() 2177 */ 2178 tdp->i_nlink--; 2179 TRANS_INODE(tdp->i_ufsvfs, tdp); 2180 tdp->i_flag |= ICHG; 2181 tdp->i_seq++; 2182 ufs_iupdat(tdp, I_SYNC); 2183 } 2184 goto fail; 2185 } 2186 } 2187 2188 /* 2189 * If the passed in attributes contain atime and/or mtime 2190 * settings, then use them instead of using the current 2191 * high resolution time. 2192 */ 2193 if (vap->va_mask & (AT_MTIME|AT_ATIME)) { 2194 if (vap->va_mask & AT_ATIME) { 2195 ip->i_atime.tv_sec = vap->va_atime.tv_sec; 2196 ip->i_atime.tv_usec = vap->va_atime.tv_nsec / 1000; 2197 ip->i_flag &= ~IACC; 2198 } else 2199 ip->i_flag |= IACC; 2200 if (vap->va_mask & AT_MTIME) { 2201 ip->i_mtime.tv_sec = vap->va_mtime.tv_sec; 2202 ip->i_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000; 2203 gethrestime(&now); 2204 if (now.tv_sec > TIME32_MAX) { 2205 /* 2206 * In 2038, ctime sticks forever.. 2207 */ 2208 ip->i_ctime.tv_sec = TIME32_MAX; 2209 ip->i_ctime.tv_usec = 0; 2210 } else { 2211 ip->i_ctime.tv_sec = now.tv_sec; 2212 ip->i_ctime.tv_usec = now.tv_nsec / 1000; 2213 } 2214 ip->i_flag &= ~(IUPD|ICHG); 2215 ip->i_flag |= IMODTIME; 2216 } else 2217 ip->i_flag |= IUPD|ICHG; 2218 ip->i_flag |= IMOD; 2219 } else 2220 ip->i_flag |= IACC|IUPD|ICHG; 2221 ip->i_seq++; 2222 2223 /* 2224 * If this is an attribute tag it as one. 
	 */
	if ((tdp->i_mode & IFMT) == IFATTRDIR) {
		ip->i_cflags |= IXATTR;
	}

	/*
	 * push inode before its name appears in a directory
	 */
	TRANS_INODE(ip->i_ufsvfs, ip);
	ufs_iupdat(ip, I_SYNC);
	rw_exit(&ip->i_contents);
	return (0);

fail:
	/* Throw away inode we just allocated. */
	ip->i_nlink = 0;
	ufs_setreclaim(ip);
	TRANS_INODE(ip->i_ufsvfs, ip);
	ip->i_flag |= ICHG;
	ip->i_seq++;
	ITIMES_NOLOCK(ip);
	rw_exit(&ip->i_contents);
	return (err);
}

/*
 * Write a prototype directory into the empty inode ip, whose parent is dp.
 *
 * ip      - new directory; i_contents must be held as writer.
 * dp      - parent directory; i_rwlock and i_contents must be held as writer.
 * attrdir - when zero, the parent's link count is incremented to account
 *           for the new ".." entry; when nonzero it is left alone.
 * cr      - credentials passed to the block allocator.
 *
 * Returns 0 on success or an error value.  Once the parent's link count
 * has been updated, every later failure goes through the "fail" label,
 * which undoes the increment and pushes the parent inode again.
 */
static int
ufs_dirmakedirect(
	struct inode *ip,		/* new directory */
	struct inode *dp,		/* parent directory */
	int attrdir,
	struct cred *cr)
{
	struct dirtemplate *dirp;
	struct fbuf *fbp;
	int err;

	ASSERT(RW_WRITE_HELD(&ip->i_contents));
	ASSERT(RW_WRITE_HELD(&dp->i_rwlock));
	ASSERT(RW_WRITE_HELD(&dp->i_contents));
	/*
	 * Allocate space for the directory we're creating.
	 */
	err = BMAPALLOC(ip, (u_offset_t)0, DIRBLKSIZ, cr);
	if (err)
		return (err);
	if (DIRBLKSIZ > dp->i_fs->fs_fsize) {
		err = ufs_fault(ITOV(dp),
"ufs_dirmakedirect: bad fs_fsize, DIRBLKSIZ: %d > dp->i_fs->fs_fsize: %d (%s)",
		    DIRBLKSIZ, dp->i_fs->fs_fsize,
		    dp->i_fs->fs_fsmnt);
		return (err);
	}
	ip->i_size = DIRBLKSIZ;
	TRANS_INODE(ip->i_ufsvfs, ip);
	ip->i_flag |= IUPD|ICHG|IATTCHG;
	ip->i_seq++;
	ITIMES_NOLOCK(ip);
	/*
	 * Update the parent (dp) link count and write out the change.
	 * This reflects the ".." entry we'll soon write.
	 *
	 * NOTE(review): the EMLINK return below happens after ip has
	 * already been given a size and flagged; nothing here undoes that,
	 * so cleanup of ip is presumably left to the caller -- confirm.
	 * The limit is also enforced when attrdir is nonzero, even though
	 * the count is not incremented in that case.
	 */
	if (dp->i_nlink == MAXLINK)
		return (EMLINK);
	if (attrdir == 0)
		dp->i_nlink++;
	TRANS_INODE(dp->i_ufsvfs, dp);
	dp->i_flag |= ICHG;
	dp->i_seq++;
	ufs_iupdat(dp, I_SYNC);
	/*
	 * Initialize directory with "."
	 * and ".." from static template.
	 *
	 * Since the parent directory is locked, we don't have to
	 * worry about anything changing when we drop the write
	 * lock on (ip).
	 *
	 */
	err = fbread(ITOV(ip), (offset_t)0, (uint_t)ip->i_fs->fs_fsize,
	    S_READ, &fbp);

	if (err) {
		goto fail;
	}
	dirp = (struct dirtemplate *)fbp->fb_addr;
	/*
	 * Now initialize the directory we're creating
	 * with the "." and ".." entries.
	 */
	*dirp = mastertemplate;			/* structure assignment */
	dirp->dot_ino = (uint32_t)ip->i_number;
	dirp->dotdot_ino = (uint32_t)dp->i_number;

	err = TRANS_DIR(ip, 0);
	if (err) {
		/* the buffer is not going to be written; drop it here */
		fbrelse(fbp, S_OTHER);
		goto fail;
	}

	/* no fbrelse() follows ufs_fbwrite(), on success or on failure */
	err = ufs_fbwrite(fbp, ip);
	if (err) {
		goto fail;
	}

	return (0);

fail:
	/* undo the link count change made above and push the parent again */
	if (attrdir == 0)
		dp->i_nlink--;
	TRANS_INODE(dp->i_ufsvfs, dp);
	dp->i_flag |= ICHG;
	dp->i_seq++;
	ufs_iupdat(dp, I_SYNC);
	return (err);
}

/*
 * Delete a directory entry.  If oip is nonzero the entry is checked
 * to make sure it still reflects oip.
 *
 * If vpp is non-null, return the ptr of the (held) vnode associated with
 * the removed name.  The caller is responsible for doing the VN_RELE().
 *
 * dp    - directory to remove the name from; the caller must hold
 *         dp->i_rwlock as writer (asserted below).  The lock may be
 *         dropped and re-acquired here on the retry path.
 * namep - name to remove; "." yields EINVAL and ".." yields EEXIST.
 * oip   - if non-NULL, the inode the name is expected to refer to;
 *         ENOENT is returned when it refers to something else.
 * cdir  - vnode compared against the victim for DR_RMDIR; a match
 *         yields EINVAL.
 * op    - DR_RMDIR, DR_REMOVE or DR_RENAME; selects the extra checks.
 * cr    - credentials used for the access checks.
 *
 * Returns 0 on success or an error value.  While working, this routine
 * holds vfs_dqrwlock (reader) and dp->i_contents (writer); both are
 * released at "out_novfs" before returning.
 */
int
ufs_dirremove(
	struct inode *dp,
	char *namep,
	struct inode *oip,
	struct vnode *cdir,
	enum dr_op op,
	struct cred *cr,
	vnode_t **vpp)	/* Return (held) vnode ptr of removed file/dir */
{
	struct direct *ep, *pep, *nep;
	struct inode *ip;
	vnode_t *dvp, *vp;
	struct slot slot;
	int namlen;
	int err;
	int mode;
	ushort_t extra;

	namlen = (int)strlen(namep);
	if (namlen == 0)
		return (ufs_fault(ITOV(dp), "ufs_dirremove: namlen == 0"));
	/*
	 * return error when removing . and ..
	 */
	if (namep[0] == '.') {
		if (namlen == 1)
			return (EINVAL);
		else if (namlen == 2 && namep[1] == '.') {
			return (EEXIST);	/* SIGH should be ENOTEMPTY */
		}
	}

	ASSERT(RW_WRITE_HELD(&dp->i_rwlock));
	/*
	 * Check accessibility of directory.
	 */
retry:
	if (((dp->i_mode & IFMT) != IFDIR) &&
	    ((dp->i_mode & IFMT) != IFATTRDIR)) {
		return (ENOTDIR);
	}

	/*
	 * Execute access is required to search the directory.
	 * Access for write is interpreted as allowing
	 * deletion of files in the directory.
	 */
	if (err = ufs_iaccess(dp, IEXEC|IWRITE, cr)) {
		return (err);
	}

	/* reset per-attempt state; this is re-run after "goto retry" */
	ip = NULL;
	slot.fbp = NULL;
	slot.status = FOUND;	/* don't need to look for empty slot */
	rw_enter(&dp->i_ufsvfs->vfs_dqrwlock, RW_READER);
	rw_enter(&dp->i_contents, RW_WRITER);
	err = ufs_dircheckforname(dp, namep, namlen, &slot, &ip, cr, 0);
	if (err)
		goto out_novfs;
	if (ip == NULL) {
		err = ENOENT;
		goto out_novfs;
	}
	vp = ITOV(ip);
	if (oip && oip != ip) {
		err = ENOENT;
		goto out_novfs;
	}

	mode = ip->i_mode & IFMT;
	if (mode == IFDIR || mode == IFATTRDIR) {

		/*
		 * vn_vfswlock() prevents races between mount and rmdir.
		 */
		if (vn_vfswlock(vp)) {
			err = EBUSY;
			goto out_novfs;
		}
		if (vn_mountedvfs(vp) != NULL && op != DR_RENAME) {
			err = EBUSY;
			goto out;
		}
		/*
		 * If we are removing a directory, get a lock on it.
		 * Taking a writer lock prevents a parallel ufs_dirlook from
		 * incorrectly entering a negative cache vnode entry in the dnlc
		 * If the directory is empty, it will stay empty until
		 * we can remove it.
		 */
		if (!rw_tryenter(&ip->i_rwlock, RW_WRITER)) {
			/*
			 * It is possible that a thread in rename would have
			 * acquired this rwlock.  To prevent a deadlock we
			 * do a rw_tryenter.  If we fail to get the lock
			 * we drop all the locks we have acquired, wait
			 * for 2 ticks and reacquire the
			 * directory's (dp) i_rwlock and try again.
			 * If we don't drop dp's i_rwlock then we will panic
			 * with a "Deadlock: cycle in blocking chain"
			 * since in ufs_dircheckpath we want dp's i_rwlock.
			 * dp is guaranteed to exist since ufs_dirremove is
			 * called after a VN_HOLD(dp) has been done.
			 */
			ufs_dirremove_retry_cnt++;
			vn_vfsunlock(vp);
			if (slot.fbp)
				fbrelse(slot.fbp, S_OTHER);
			rw_exit(&dp->i_contents);
			rw_exit(&dp->i_ufsvfs->vfs_dqrwlock);
			rw_exit(&dp->i_rwlock);
			VN_RELE(vp);
			delay(2);
			rw_enter(&dp->i_rwlock, RW_WRITER);
			goto retry;
		}
	}
	rw_enter(&ip->i_contents, RW_READER);

	/*
	 * Now check the restrictions that apply on sticky directories.
	 */
	if ((err = ufs_sticky_remove_access(dp, ip, cr)) != 0) {
		rw_exit(&ip->i_contents);
		if (mode == IFDIR || mode == IFATTRDIR)
			rw_exit(&ip->i_rwlock);
		goto out;
	}

	if (op == DR_RMDIR) {
		/*
		 * For rmdir(2), some special checks are required.
		 * (a) Don't remove any alias of the parent (e.g. ".").
		 * (b) Don't remove the current directory.
		 * (c) Make sure the entry is (still) a directory.
		 * (d) Make sure the directory is empty.
		 */

		if (dp == ip || vp == cdir)
			err = EINVAL;
		else if (((ip->i_mode & IFMT) != IFDIR) &&
		    ((ip->i_mode & IFMT) != IFATTRDIR))
			err = ENOTDIR;
		else if ((ip->i_nlink > 2) ||
		    !ufs_dirempty(ip, dp->i_number, cr)) {
			err = EEXIST;	/* SIGH should be ENOTEMPTY */
		}

		if (err) {
			rw_exit(&ip->i_contents);
			if (mode == IFDIR || mode == IFATTRDIR)
				rw_exit(&ip->i_rwlock);
			goto out;
		}
	} else if (op == DR_REMOVE) {
		/*
		 * unlink(2) requires a different check: allow only
		 * privileged users to unlink a directory.
		 */
		if (vp->v_type == VDIR &&
		    secpolicy_fs_linkdir(cr, vp->v_vfsp)) {
			err = EPERM;
			rw_exit(&ip->i_contents);
			/*
			 * NOTE(review): i_rwlock is released without the
			 * mode test used elsewhere; presumably v_type ==
			 * VDIR implies it was taken above -- confirm.
			 */
			rw_exit(&ip->i_rwlock);
			goto out;
		}
	}

	rw_exit(&ip->i_contents);

	/*
	 * Remove the cache'd entry, if any.
	 */
	dvp = ITOV(dp);
	dnlc_remove(dvp, namep);
	ep = slot.ep;
	ep->d_ino = 0;		/* marks the on-disk entry as free */

	if (slot.cached) {
		dcanchor_t *dcap = &dp->i_danchor;

		(void) dnlc_dir_rem_entry(dcap, namep, NULL);
		if (((int)ep->d_reclen - (int)DIRSIZ(ep)) >= LDIRSIZ(1)) {
			(void) dnlc_dir_rem_space_by_handle(dcap, slot.offset);
		}
		if (slot.offset & (DIRBLKSIZ - 1)) {
			/*
			 * Collapse new free space into previous entry.
			 * Note, the previous entry has already been
			 * validated in ufs_dircheckforname().
			 */
			ASSERT(slot.size);
			pep = (struct direct *)((char *)ep - slot.size);
			if ((pep->d_ino == 0) &&
			    ((uintptr_t)pep & (DIRBLKSIZ - 1))) {
				/*
				 * Give up on the directory cache.  Note
				 * that this jump also skips the merge into
				 * the previous entry; the removed entry is
				 * left as its own free record (d_ino == 0).
				 */
				dnlc_dir_purge(dcap);
				slot.cached = 0;
				goto nocache;
			}
			if (pep->d_ino) {
				extra = pep->d_reclen - DIRSIZ(pep);
			} else {
				extra = pep->d_reclen;
			}
			if (extra >= LDIRSIZ(1)) {
				(void) dnlc_dir_rem_space_by_handle(dcap,
				    (uint64_t)(slot.offset - slot.size));
			}
			pep->d_reclen += ep->d_reclen;
			(void) dnlc_dir_add_space(dcap, extra + ep->d_reclen,
			    (uint64_t)(slot.offset - slot.size));
			/* adjust the previous pointer in the next entry */
			nep = (struct direct *)((char *)ep + ep->d_reclen);
			if ((uintptr_t)nep & (DIRBLKSIZ - 1)) {
				/*
				 * Not a new block.
				 *
				 * Check the validity of the entry.
				 * If it's bad, then throw away the cache and
				 * continue.
				 */
				if ((nep->d_reclen == 0) ||
				    (nep->d_reclen & 0x3) ||
				    (dnlc_dir_update(dcap, nep->d_name,
				    INO_OFF_TO_H(nep->d_ino,
				    slot.offset - slot.size)) == DNOENT)) {
					dnlc_dir_purge(dcap);
					slot.cached = 0;
				}
			}
		} else {
			(void) dnlc_dir_add_space(dcap, ep->d_reclen,
			    (uint64_t)slot.offset);
		}
	} else {
		/*
		 * If the entry isn't the first in the directory, we must
		 * reclaim the space of the now empty record by adding
		 * the record size to the size of the previous entry.
		 */
		if (slot.offset & (DIRBLKSIZ - 1)) {
			/*
			 * Collapse new free space into previous entry.
			 */
			pep = (struct direct *)((char *)ep - slot.size);
			pep->d_reclen += ep->d_reclen;
		}
	}
nocache:


	/*
	 * Either way slot.fbp is finished with after this point, so it is
	 * cleared to keep the common exit path from releasing it again.
	 */
	err = TRANS_DIR(dp, slot.offset);
	if (err)
		fbrelse(slot.fbp, S_OTHER);
	else
		err = ufs_fbwrite(slot.fbp, dp);
	slot.fbp = NULL;

	/*
	 * If we were removing a directory, it is 'gone' now, but we cannot
	 * unlock it as a thread may be waiting for the lock in ufs_create.  If
	 * we did, it could then create a file in a deleted directory.
	 */

	if (err) {
		if (mode == IFDIR || mode == IFATTRDIR)
			rw_exit(&ip->i_rwlock);
		goto out;
	}

	rw_enter(&ip->i_contents, RW_WRITER);

	dp->i_flag |= IUPD|ICHG;
	dp->i_seq++;
	ip->i_flag |= ICHG;
	ip->i_seq++;

	TRANS_INODE(dp->i_ufsvfs, dp);
	TRANS_INODE(ip->i_ufsvfs, ip);
	/*
	 * Now dispose of the inode.
	 */
	if (ip->i_nlink > 0) {
		/*
		 * This is not done for IFATTRDIR's because they don't
		 * have entries in the dnlc and the link counts are
		 * not incremented when they are created.
		 */
		if (op == DR_RMDIR && (ip->i_mode & IFMT) == IFDIR) {
			/*
			 * Decrement by 2 because we're trashing the "."
			 * entry as well as removing the entry in dp.
			 * Clear the directory entry, but there may be
			 * other hard links so don't free the inode.
			 * Decrement the dp linkcount because we're
			 * trashing the ".." entry.
			 */
			ip->i_nlink -= 2;
			dp->i_nlink--;
			ufs_setreclaim(dp);
			/*
			 * XXX need to discard negative cache entries
			 * for vp.  See comment in ufs_delete().
			 */
			dnlc_remove(vp, ".");
			dnlc_remove(vp, "..");
			/*
			 * The return value is ignored here because if
			 * the directory purge fails we don't want to
			 * stop the delete.  If ufs_dirpurgedotdot fails
			 * the delete will continue with the preexisting
			 * behavior.
			 */
			(void) ufs_dirpurgedotdot(ip, dp->i_number, cr);
		} else {
			ip->i_nlink--;
		}
		ufs_setreclaim(ip);
	}
	ITIMES_NOLOCK(dp);
	ITIMES_NOLOCK(ip);

	if (!TRANS_ISTRANS(dp->i_ufsvfs))
		ufs_iupdat(dp, I_SYNC);
	if (!TRANS_ISTRANS(ip->i_ufsvfs))
		ufs_iupdat(ip, I_SYNC);

	rw_exit(&ip->i_contents);
	if (mode == IFDIR || mode == IFATTRDIR)
		rw_exit(&ip->i_rwlock);
out:
	/* reached only after vp/mode are valid; undo vn_vfswlock() */
	if (mode == IFDIR || mode == IFATTRDIR) {
		vn_vfsunlock(vp);
	}
out_novfs:
	ASSERT(RW_WRITE_HELD(&dp->i_contents));

	if (slot.fbp)
		fbrelse(slot.fbp, S_OTHER);

	rw_exit(&dp->i_contents);
	rw_exit(&dp->i_ufsvfs->vfs_dqrwlock);

	/*
	 * If no error and vpp is non-NULL, return the vnode ptr to the caller.
	 * The caller becomes responsible for the VN_RELE().  Otherwise,
	 * Release (and delete) the inode after we drop vfs_dqrwlock to
	 * avoid deadlock since ufs_delete() grabs vfs_dqrwlock as reader.
	 */
	if (ip) {
		if ((err == 0) && (vpp != NULL)) {
			*vpp = ITOV(ip);
		} else {
			VN_RELE(vp);
		}
	}

	return (err);
}

/*
 * Return buffer with contents of block "offset"
 * from the beginning of directory "ip".
If "res" 2715 * is non-zero, fill it in with a pointer to the 2716 * remaining space in the directory. 2717 * 2718 */ 2719 2720 int 2721 blkatoff( 2722 struct inode *ip, 2723 off_t offset, 2724 char **res, 2725 struct fbuf **fbpp) 2726 { 2727 struct fs *fs; 2728 struct fbuf *fbp; 2729 daddr_t lbn; 2730 uint_t bsize; 2731 int err; 2732 2733 CPU_STATS_ADD_K(sys, ufsdirblk, 1); 2734 fs = ip->i_fs; 2735 lbn = (daddr_t)lblkno(fs, offset); 2736 bsize = (uint_t)blksize(fs, ip, lbn); 2737 err = fbread(ITOV(ip), (offset_t)(offset & fs->fs_bmask), 2738 bsize, S_READ, &fbp); 2739 if (err) { 2740 *fbpp = (struct fbuf *)NULL; 2741 return (err); 2742 } 2743 if (res) 2744 *res = fbp->fb_addr + blkoff(fs, offset); 2745 *fbpp = fbp; 2746 return (0); 2747 } 2748 2749 /* 2750 * Do consistency checking: 2751 * record length must be multiple of 4 2752 * entry must fit in rest of its DIRBLKSIZ block 2753 * record must be large enough to contain entry 2754 * name is not longer than MAXNAMLEN 2755 * name must be as long as advertised, and null terminated 2756 * NOTE: record length must not be zero (should be checked previously). 2757 * This routine is only called if dirchk is true. 2758 * It would be nice to set the FSBAD flag in the super-block when 2759 * this routine fails so that a fsck is forced on next reboot, 2760 * but locking is a problem. 
2761 */ 2762 static int 2763 dirmangled( 2764 struct inode *dp, 2765 struct direct *ep, 2766 int entryoffsetinblock, 2767 off_t offset) 2768 { 2769 int i; 2770 2771 i = DIRBLKSIZ - (entryoffsetinblock & (DIRBLKSIZ - 1)); 2772 if ((ep->d_reclen & 0x3) != 0 || (int)ep->d_reclen > i || 2773 (uint_t)ep->d_reclen < DIRSIZ(ep) || ep->d_namlen > MAXNAMLEN || 2774 ep->d_ino && dirbadname(ep->d_name, (int)ep->d_namlen)) { 2775 dirbad(dp, "mangled entry", offset); 2776 return (1); 2777 } 2778 return (0); 2779 } 2780 2781 static void 2782 dirbad(struct inode *ip, char *how, off_t offset) 2783 { 2784 cmn_err(CE_NOTE, "%s: bad dir ino %d at offset %ld: %s", 2785 ip->i_fs->fs_fsmnt, (int)ip->i_number, offset, how); 2786 } 2787 2788 static int 2789 dirbadname(char *sp, int l) 2790 { 2791 while (l--) { /* check for nulls */ 2792 if (*sp++ == '\0') { 2793 return (1); 2794 } 2795 } 2796 return (*sp); /* check for terminating null */ 2797 } 2798 2799 /* 2800 * Check if a directory is empty or not. 2801 */ 2802 static int 2803 ufs_dirempty( 2804 struct inode *ip, 2805 ino_t parentino, 2806 struct cred *cr) 2807 { 2808 return (ufs_dirscan(ip, parentino, cr, 0)); 2809 } 2810 2811 /* 2812 * clear the .. directory entry. 2813 */ 2814 static int 2815 ufs_dirpurgedotdot( 2816 struct inode *ip, 2817 ino_t parentino, 2818 struct cred *cr) 2819 { 2820 return (ufs_dirscan(ip, parentino, cr, 1)); 2821 } 2822 2823 /* 2824 * Scan the directoy. If clr_dotdot is true clear the .. 2825 * directory else check to see if the directory is empty. 2826 * 2827 * Using a struct dirtemplate here is not precisely 2828 * what we want, but better than using a struct direct. 2829 * 2830 * clr_dotdot is used as a flag to tell us if we need 2831 * to clear the dotdot entry 2832 * 2833 * N.B.: does not handle corrupted directories. 
 *
 * Returns 1 when every in-use entry is "." or a ".." whose inode number
 * is parentino, and 0 otherwise (including on a read error or a zero
 * record length).  When clr_dotdot is set the scan stops at that ".."
 * entry and the value returned is whatever ufs_dirclrdotdot() returned.
 */
static int
ufs_dirscan(
	struct inode *ip,
	ino_t parentino,
	struct cred *cr,
	int clr_dotdot)
{
	offset_t off;
	struct dirtemplate dbuf;
	struct direct *dp = (struct direct *)&dbuf;
	int err, count;
	int empty = 1;	/* Assume it's empty */
	/* only the leading MINDIRSIZ bytes of each entry are read */
#define	MINDIRSIZ (sizeof (struct dirtemplate) / 2)

	ASSERT(RW_LOCK_HELD(&ip->i_contents));

	ASSERT(ip->i_size <= (offset_t)MAXOFF_T);
	for (off = 0; off < ip->i_size; off += dp->d_reclen) {
		err = ufs_rdwri(UIO_READ, FREAD, ip, (caddr_t)dp,
		    (ssize_t)MINDIRSIZ, off, UIO_SYSSPACE, &count, cr);
		/*
		 * Since we read MINDIRSIZ, residual must
		 * be 0 unless we're at end of file.
		 */
		if (err || count != 0 || dp->d_reclen == 0) {
			empty = 0;
			break;
		}
		/* skip empty entries */
		if (dp->d_ino == 0)
			continue;
		/* accept only "." and ".." */
		if (dp->d_namlen > 2 || dp->d_name[0] != '.') {
			empty = 0;
			break;
		}
		/*
		 * At this point d_namlen must be 1 or 2.
		 * 1 implies ".", 2 implies ".." if second
		 * char is also "."
		 */
		if (dp->d_namlen == 1)
			continue;
		if (dp->d_name[1] == '.' &&
		    (ino_t)dp->d_ino == parentino) {
			/*
			 * If we're doing a purge we need to check for
			 * the . and .. entries and clear the d_ino for ..
			 *
			 * if clr_dotdot is set ufs_dirscan does not
			 * check for an empty directory.
			 */
			if (clr_dotdot) {
				/*
				 * Have to actually zap the ..
				 * entry in the directory, as
				 * otherwise someone might have
				 * dp as its cwd and try to
				 * open .., which now points to
				 * an unallocated inode.
				 */
				empty = ufs_dirclrdotdot(ip, parentino);
				break;
			} else {
				continue;
			}
		}
		/* two-character name that is not ".." of parentino */
		empty = 0;
		break;
	}
	return (empty);
}

clock_t retry_backoff_delay = 1;	/* delay before retrying the i_rwlock */
uint64_t dircheck_retry_cnt;	/* number of backoff retries taken below */
/*
 * Check if source directory inode is in the path of the target directory.
 * Target is supplied locked.
 *
 * The source and target inode's should be different upon entry.
 *
 * source_ino - inode number of the directory being moved.
 * target     - directory to start from; its i_rwlock must be held.
 * sdp        - parent of the source; its i_rwlock must be held.
 * cr         - credentials passed to ufs_iget_alloced().
 *
 * Returns 0 if the walk reaches the root (or sdp) without meeting
 * source_ino, EINVAL if source_ino is target or one of its ancestors,
 * ENOTDIR if a directory on the way looks damaged, EAGAIN if an
 * ancestor's i_rwlock could not be taken safely, or the error from
 * blkatoff()/ufs_iget_alloced().
 *
 * Every ancestor other than target is held and read-locked only while
 * it is the current directory of the walk; target's own lock and hold
 * are never released here.
 */
int
ufs_dircheckpath(
	ino_t source_ino,
	struct inode *target,
	struct inode *sdp,
	struct cred *cr)
{
	struct fbuf *fbp;
	struct dirtemplate *dirp;
	struct inode *ip;
	struct ufsvfs *ufsvfsp;
	struct inode *tip;
	ino_t dotdotino;
	int err;

	ASSERT(target->i_ufsvfs != NULL);
	ASSERT(RW_LOCK_HELD(&target->i_rwlock));
	ASSERT(RW_LOCK_HELD(&sdp->i_rwlock));

	ip = target;
	if (ip->i_number == source_ino) {
		err = EINVAL;
		goto out;
	}
	if (ip->i_number == UFSROOTINO) {
		err = 0;
		goto out;
	}
	/*
	 * Search back through the directory tree, using the ".." entries.
	 * Fail any attempt to move a directory into an ancestor directory.
	 */
	fbp = NULL;
	for (;;) {
		struct vfs *vfs;

		err = blkatoff(ip, (off_t)0, (char **)&dirp, &fbp);
		if (err)
			break;
		if (((ip->i_mode & IFMT) != IFDIR) || ip->i_nlink == 0 ||
		    ip->i_size < sizeof (struct dirtemplate)) {
			dirbad(ip, "bad size, unlinked or not dir", (off_t)0);
			err = ENOTDIR;
			break;
		}
		if (dirp->dotdot_namlen != 2 ||
		    dirp->dotdot_name[0] != '.' ||
		    dirp->dotdot_name[1] != '.') {
			dirbad(ip, "mangled .. entry", (off_t)0);
			err = ENOTDIR;		/* Sanity check */
			break;
		}
		dotdotino = (ino_t)dirp->dotdot_ino;
		if (dotdotino == source_ino) {
			err = EINVAL;
			break;
		}
		/* reached the root without meeting source_ino; err is 0 */
		if (dotdotino == UFSROOTINO)
			break;
		if (fbp) {
			fbrelse(fbp, S_OTHER);
			fbp = NULL;
		}
		/* remember these before ip may be released below */
		vfs = ip->i_vfs;
		ufsvfsp = ip->i_ufsvfs;

		if (ip != target) {
			rw_exit(&ip->i_rwlock);
			VN_RELE(ITOV(ip));
		}
		/*
		 * Race to get the inode.
		 */
		rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
		if (err = ufs_iget_alloced(vfs, dotdotino, &tip, cr)) {
			rw_exit(&ufsvfsp->vfs_dqrwlock);
			/* nothing is held any more; skip cleanup at "out" */
			ip = NULL;
			break;
		}
		rw_exit(&ufsvfsp->vfs_dqrwlock);
		/*
		 * If the directory of the source inode (also a directory)
		 * is the same as this next entry up the chain, then
		 * we know the source directory itself can't be in the
		 * chain.  This also prevents a panic because we already
		 * have sdp->i_rwlock locked.
		 */
		if (tip == sdp) {
			VN_RELE(ITOV(tip));
			ip = NULL;
			break;
		}
		ip = tip;

		/*
		 * If someone has set the WRITE_WANTED bit in this lock and if
		 * this happens to be a sdp or tdp of another parallel rename
		 * which is executing the same code and in similar situation
		 * we end up in a 4 way deadlock.  We need to make sure that
		 * the WRITE_WANTED bit is not set.
		 */
retry_lock:
		if (!rw_tryenter(&ip->i_rwlock, RW_READER)) {
			/*
			 * If the lock held as WRITER that's fine but if it
			 * has WRITE_WANTED bit set we might end up in a
			 * deadlock.  If WRITE_WANTED is set we return
			 * with EAGAIN else we just go back and try.
			 */
			if (RW_ISWRITER(&ip->i_rwlock) &&
			    !(RW_WRITE_HELD(&ip->i_rwlock))) {
				err = EAGAIN;
				if (fbp) {
					fbrelse(fbp, S_OTHER);
				}
				/* ip was never locked; only drop the hold */
				VN_RELE(ITOV(ip));
				return (err);
			} else {
				/*
				 * The lock is being write held.  We could
				 * just do a rw_enter here but there is a
				 * window between the check and now, where
				 * the status could have changed, so to
				 * avoid looping we backoff and go back to
				 * try for the lock.
				 */
				delay(retry_backoff_delay);
				dircheck_retry_cnt++;
				goto retry_lock;
			}
		}
	}
	if (fbp) {
		fbrelse(fbp, S_OTHER);
	}
out:
	if (ip) {
		if (ip != target) {
			rw_exit(&ip->i_rwlock);
			VN_RELE(ITOV(ip));
		}
	}
	return (err);
}

/*
 * Check whether an extended attribute directory is empty.
 *
 * The directory is read one entry header at a time.  Free entries are
 * skipped; a "." entry is accepted only when its inode number equals
 * parentino, and a ".." entry is accepted whatever its inode number.
 * Any other in-use entry, a read error, or a zero record length makes
 * the result 0.  Returns 1 when nothing else was found.
 *
 * The caller must hold ip->i_contents.
 */
int
ufs_xattrdirempty(struct inode *ip, ino_t parentino, struct cred *cr)
{
	offset_t off;
	struct dirtemplate dbuf;
	struct direct *dp = (struct direct *)&dbuf;
	int err, count;
	int empty = 1;	/* Assume it's empty */
	/* only the leading MINDIRSIZ bytes of each entry are read */
#define	MINDIRSIZ (sizeof (struct dirtemplate) / 2)

	ASSERT(RW_LOCK_HELD(&ip->i_contents));

	ASSERT(ip->i_size <= (offset_t)MAXOFF_T);
	for (off = 0; off < ip->i_size; off += dp->d_reclen) {
		err = ufs_rdwri(UIO_READ, FREAD, ip, (caddr_t)dp,
		    (ssize_t)MINDIRSIZ, off, UIO_SYSSPACE, &count, cr);
		/*
		 * Since we read MINDIRSIZ, residual must
		 * be 0 unless we're at end of file.
		 */

		if (err || count != 0 || dp->d_reclen == 0) {
			empty = 0;
			break;
		}
		/* skip empty entries */
		if (dp->d_ino == 0)
			continue;
		/*
		 * Unlike ufs_dirscan(), d_namlen has not been bounded
		 * here, so both tests below check it explicitly.
		 * "." is accepted only if it refers to parentino;
		 * ".." is accepted unconditionally.
		 */

		if (dp->d_namlen == 1 && dp->d_name[0] == '.' &&
		    (ino_t)dp->d_ino == parentino)
			continue;

		if (dp->d_namlen == 2 && dp->d_name[0] == '.' &&
		    dp->d_name[1] == '.') {
			continue;
		}
		empty = 0;
		break;
	}
	return (empty);
}


/*
 * Allocate and initialize a new shadow inode to contain extended attributes.
3112 */ 3113 int 3114 ufs_xattrmkdir( 3115 struct inode *tdp, 3116 struct inode **ipp, 3117 int flags, 3118 struct cred *cr) 3119 { 3120 struct inode *ip; 3121 struct vattr va; 3122 int err; 3123 int retry = 1; 3124 struct ufsvfs *ufsvfsp; 3125 struct ulockfs *ulp; 3126 int issync; 3127 int trans_size; 3128 int dorwlock; /* 0 = not yet taken, */ 3129 /* 1 = taken outside the transaction, */ 3130 /* 2 = taken inside the transaction */ 3131 3132 /* 3133 * Validate permission to create attribute directory 3134 */ 3135 3136 if ((err = ufs_iaccess(tdp, IWRITE, cr)) != 0) { 3137 return (err); 3138 } 3139 3140 if (vn_is_readonly(ITOV(tdp))) 3141 return (EROFS); 3142 3143 /* 3144 * No need to re-init err after again:, since it's set before 3145 * the next use of it. 3146 */ 3147 again: 3148 dorwlock = 0; 3149 va.va_type = VDIR; 3150 va.va_uid = tdp->i_uid; 3151 va.va_gid = tdp->i_gid; 3152 3153 if ((tdp->i_mode & IFMT) == IFDIR) { 3154 va.va_mode = (o_mode_t)IFATTRDIR; 3155 va.va_mode |= tdp->i_mode & 0777; 3156 } else { 3157 va.va_mode = (o_mode_t)IFATTRDIR|0700; 3158 if (tdp->i_mode & 0040) 3159 va.va_mode |= 0750; 3160 if (tdp->i_mode & 0004) 3161 va.va_mode |= 0705; 3162 } 3163 va.va_mask = AT_TYPE|AT_MODE; 3164 3165 ufsvfsp = tdp->i_ufsvfs; 3166 3167 err = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_MKDIR_MASK); 3168 if (err) 3169 return (err); 3170 3171 /* 3172 * Acquire i_rwlock before TRANS_BEGIN_CSYNC() if this is a file. 3173 * This follows the protocol for read()/write(). 3174 */ 3175 if (ITOV(tdp)->v_type != VDIR) { 3176 rw_enter(&tdp->i_rwlock, RW_WRITER); 3177 dorwlock = 1; 3178 } 3179 3180 if (ulp) { 3181 trans_size = (int)TOP_MKDIR_SIZE(tdp); 3182 TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_MKDIR, trans_size); 3183 } 3184 3185 /* 3186 * Acquire i_rwlock after TRANS_BEGIN_CSYNC() if this is a directory. 3187 * This follows the protocol established by 3188 * ufs_link/create/remove/rename/mkdir/rmdir/symlink. 
3189 */ 3190 if (dorwlock == 0) { 3191 rw_enter(&tdp->i_rwlock, RW_WRITER); 3192 dorwlock = 2; 3193 } 3194 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER); 3195 rw_enter(&tdp->i_contents, RW_WRITER); 3196 3197 /* 3198 * Suppress out of inodes messages if we will retry. 3199 */ 3200 if (retry) 3201 tdp->i_flag |= IQUIET; 3202 err = ufs_dirmakeinode(tdp, &ip, &va, DE_ATTRDIR, cr); 3203 tdp->i_flag &= ~IQUIET; 3204 3205 if (err) 3206 goto fail; 3207 3208 if (flags) { 3209 3210 /* 3211 * Now attach it to src file. 3212 */ 3213 3214 tdp->i_oeftflag = ip->i_number; 3215 } 3216 3217 ip->i_cflags |= IXATTR; 3218 ITOV(ip)->v_flag |= V_XATTRDIR; 3219 TRANS_INODE(ufsvfsp, tdp); 3220 tdp->i_flag |= ICHG | IUPD; 3221 tdp->i_seq++; 3222 ufs_iupdat(tdp, I_SYNC); 3223 rw_exit(&tdp->i_contents); 3224 rw_exit(&ufsvfsp->vfs_dqrwlock); 3225 3226 rw_enter(&ip->i_rwlock, RW_WRITER); 3227 rw_enter(&ip->i_contents, RW_WRITER); 3228 TRANS_INODE(ufsvfsp, ip); 3229 ip->i_flag |= ICHG| IUPD; 3230 ip->i_seq++; 3231 ufs_iupdat(ip, I_SYNC); 3232 rw_exit(&ip->i_contents); 3233 rw_exit(&ip->i_rwlock); 3234 if (dorwlock == 2) 3235 rw_exit(&tdp->i_rwlock); 3236 if (ulp) { 3237 int terr = 0; 3238 3239 TRANS_END_CSYNC(ufsvfsp, err, issync, TOP_MKDIR, trans_size); 3240 ufs_lockfs_end(ulp); 3241 if (err == 0) 3242 err = terr; 3243 } 3244 if (dorwlock == 1) 3245 rw_exit(&tdp->i_rwlock); 3246 *ipp = ip; 3247 return (err); 3248 3249 fail: 3250 rw_exit(&tdp->i_contents); 3251 rw_exit(&ufsvfsp->vfs_dqrwlock); 3252 if (dorwlock == 2) 3253 rw_exit(&tdp->i_rwlock); 3254 if (ulp) { 3255 TRANS_END_CSYNC(ufsvfsp, err, issync, TOP_MKDIR, trans_size); 3256 ufs_lockfs_end(ulp); 3257 } 3258 if (dorwlock == 1) 3259 rw_exit(&tdp->i_rwlock); 3260 if (ip != NULL) 3261 VN_RELE(ITOV(ip)); 3262 3263 /* 3264 * No inodes? See if any are tied up in pending deletions. 3265 * This has to be done outside of any of the above, because 3266 * the draining operation can't be done from inside a transaction. 
3267 */ 3268 if ((err == ENOSPC) && retry && TRANS_ISTRANS(ufsvfsp)) { 3269 ufs_delete_drain_wait(ufsvfsp, 1); 3270 retry = 0; 3271 goto again; 3272 } 3273 3274 return (err); 3275 } 3276 3277 /* 3278 * clear the dotdot directory entry. 3279 * Used by ufs_dirscan when clr_dotdot 3280 * flag is set and we're deleting a 3281 * directory. 3282 */ 3283 static int 3284 ufs_dirclrdotdot(struct inode *ip, ino_t parentino) 3285 { 3286 struct fbuf *fbp; 3287 struct direct *dotp, *dotdotp; 3288 int err = 0; 3289 3290 ASSERT(RW_WRITE_HELD(&ip->i_rwlock)); 3291 ASSERT(RW_LOCK_HELD(&ip->i_contents)); 3292 err = blkatoff(ip, 0, NULL, &fbp); 3293 if (err) { 3294 return (err); 3295 } 3296 3297 dotp = (struct direct *)fbp->fb_addr; 3298 if ((dotp->d_namlen < (MAXNAMLEN + 1)) && 3299 ((DIRBLKSIZ - DIRSIZ(dotp)) >= (sizeof (struct dirtemplate) / 2))) { 3300 dotdotp = (struct direct *)((char *)dotp + dotp->d_reclen); 3301 if ((dotdotp->d_namlen < (MAXNAMLEN + 1)) && 3302 ((DIRBLKSIZ - DIRSIZ(dotp)) >= dotdotp->d_reclen)) { 3303 3304 dotp->d_reclen += dotdotp->d_reclen; 3305 if (parentino == dotdotp->d_ino) { 3306 dotdotp->d_ino = 0; 3307 dotdotp->d_namlen = 0; 3308 dotdotp->d_reclen = 0; 3309 } 3310 3311 err = TRANS_DIR(ip, 0); 3312 if (err) { 3313 fbrelse(fbp, S_OTHER); 3314 } else { 3315 err = ufs_fbwrite(fbp, ip); 3316 } 3317 } 3318 } else { 3319 err = -1; 3320 } 3321 return (err); 3322 } 3323