1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License, Version 1.0 only 6 * (the "License"). You may not use this file except in compliance 7 * with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright 2005 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ 28 /* All Rights Reserved */ 29 30 /* 31 * University Copyright- Copyright (c) 1982, 1986, 1988 32 * The Regents of the University of California 33 * All Rights Reserved 34 * 35 * University Acknowledgment- Portions of this document are derived from 36 * software developed by the University of California, Berkeley, and its 37 * contributors. 38 */ 39 40 41 #pragma ident "%Z%%M% %I% %E% SMI" 42 43 /* 44 * Directory manipulation routines. 45 * 46 * When manipulating directories, the i_rwlock provides serialization 47 * since directories cannot be mmapped. The i_contents lock is redundant. 
48 */ 49 50 #include <sys/types.h> 51 #include <sys/t_lock.h> 52 #include <sys/param.h> 53 #include <sys/systm.h> 54 #include <sys/signal.h> 55 #include <sys/cred.h> 56 #include <sys/proc.h> 57 #include <sys/disp.h> 58 #include <sys/user.h> 59 #include <sys/vfs.h> 60 #include <sys/vnode.h> 61 #include <sys/stat.h> 62 #include <sys/mode.h> 63 #include <sys/buf.h> 64 #include <sys/uio.h> 65 #include <sys/dnlc.h> 66 #include <sys/fs/ufs_inode.h> 67 #include <sys/fs/ufs_fs.h> 68 #include <sys/mount.h> 69 #include <sys/fs/ufs_fsdir.h> 70 #include <sys/fs/ufs_trans.h> 71 #include <sys/fs/ufs_panic.h> 72 #include <sys/fs/ufs_quota.h> 73 #include <sys/errno.h> 74 #include <sys/debug.h> 75 #include <vm/seg.h> 76 #include <sys/sysmacros.h> 77 #include <sys/cmn_err.h> 78 #include <sys/cpuvar.h> 79 #include <sys/unistd.h> 80 #include <sys/policy.h> 81 82 /* 83 * This is required since we're using P2ROUNDUP_TYPED on DIRBLKSIZ 84 */ 85 #if !ISP2(DIRBLKSIZ) 86 #error "DIRBLKSIZ not a power of 2" 87 #endif 88 89 /* 90 * A virgin directory. 91 */ 92 static struct dirtemplate mastertemplate = { 93 0, 12, 1, ".", 94 0, DIRBLKSIZ - 12, 2, ".." 95 }; 96 97 #define LDIRSIZ(len) \ 98 ((sizeof (struct direct) - (MAXNAMLEN + 1)) + ((len + 1 + 3) &~ 3)) 99 #define MAX_DIR_NAME_LEN(len) \ 100 (((len) - (sizeof (struct direct) - (MAXNAMLEN + 1))) - 1) 101 102 /* 103 * The dnlc directory cache allows a 64 bit handle for directory entries. 104 * For ufs we squeeze both the 32 bit inumber and a 32 bit disk offset 105 * into the handle. Note, a 32 bit offset allows a 4GB directory, which 106 * is way beyond what could be cached in memory by the directory 107 * caching routines. So we are quite safe with this limit. 108 * The macros below pack and unpack the handle. 
109 */ 110 #define H_TO_INO(h) (uint32_t)((h) & UINT_MAX) 111 #define H_TO_OFF(h) (off_t)((h) >> 32) 112 #define INO_OFF_TO_H(ino, off) (uint64_t)(((uint64_t)(off) << 32) | (ino)) 113 114 /* 115 * The average size of a typical on disk directory entry is about 16 bytes 116 * and so defines AV_DIRECT_SHIFT : log2(16) 117 * This define is only used to approximate the number of entries 118 * is a directory. This is needed for dnlc_dir_start() which will immediately 119 * return an error if the value is not within its acceptable range of 120 * number of files in a directory. 121 */ 122 #define AV_DIRECT_SHIFT 4 123 /* 124 * If the directory size (from i_size) is greater than the ufs_min_dir_cache 125 * tunable then we request dnlc directory caching. 126 * This has found to be profitable after 1024 file names. 127 */ 128 int ufs_min_dir_cache = 1024 << AV_DIRECT_SHIFT; 129 130 #ifdef DEBUG 131 int dirchk = 1; 132 #else /* !DEBUG */ 133 int dirchk = 0; 134 #endif /* DEBUG */ 135 int ufs_negative_cache = 1; 136 uint64_t ufs_dirremove_retry_cnt; 137 138 static void dirbad(); 139 static int ufs_dircheckforname(); 140 static int ufs_dirrename(); 141 static int ufs_diraddentry(); 142 static int ufs_dirempty(); 143 static int ufs_dirscan(); 144 static int ufs_dirclrdotdot(); 145 static int ufs_dirfixdotdot(); 146 static int ufs_dirpurgedotdot(); 147 static int dirprepareentry(); 148 static int ufs_dirmakedirect(); 149 static int dirbadname(); 150 static int dirmangled(); 151 152 /* 153 * Look for a given name in a directory. On successful return, *ipp 154 * will point to the VN_HELD inode. 
155 */ 156 int 157 ufs_dirlook( 158 struct inode *dp, 159 char *namep, 160 struct inode **ipp, 161 struct cred *cr, 162 int skipdnlc) /* skip the 1st level dnlc */ 163 { 164 uint64_t handle; 165 struct fbuf *fbp; /* a buffer of directory entries */ 166 struct direct *ep; /* the current directory entry */ 167 struct vnode *vp; 168 struct vnode *dvp; /* directory vnode ptr */ 169 dcanchor_t *dcap; 170 off_t endsearch; /* offset to end directory search */ 171 off_t offset; 172 off_t start_off; /* starting offset from middle search */ 173 off_t last_offset; /* last offset */ 174 int entryoffsetinblock; /* offset of ep in addr's buffer */ 175 int numdirpasses; /* strategy for directory search */ 176 int namlen; /* length of name */ 177 int err; 178 int doingchk; 179 int i; 180 int caching; 181 ino_t ep_ino; /* entry i number */ 182 ino_t chkino; 183 ushort_t ep_reclen; /* direct local d_reclen */ 184 185 ASSERT(*namep != '\0'); /* All callers ensure *namep is non null */ 186 187 /* 188 * Check accessibility of directory. 189 */ 190 if (((dp->i_mode & IFMT) != IFDIR) && 191 ((dp->i_mode & IFMT) != IFATTRDIR)) 192 return (ENOTDIR); 193 194 if (err = ufs_iaccess(dp, IEXEC, cr)) 195 return (err); 196 197 /* 198 * Check the directory name lookup cache, first for individual files 199 * then for complete directories. 200 */ 201 dvp = ITOV(dp); 202 if (!skipdnlc && (vp = dnlc_lookup(dvp, namep))) { 203 /* vp is already held from dnlc_lookup */ 204 if (vp == DNLC_NO_VNODE) { 205 VN_RELE(vp); 206 return (ENOENT); 207 } 208 *ipp = VTOI(vp); 209 return (0); 210 } 211 212 dcap = &dp->i_danchor; 213 214 /* 215 * Grab the reader lock on the directory data before checking 216 * the dnlc to avoid a race with ufs_dirremove() & friends. 217 */ 218 rw_enter(&dp->i_rwlock, RW_READER); 219 220 switch (dnlc_dir_lookup(dcap, namep, &handle)) { 221 case DFOUND: 222 ep_ino = (ino_t)H_TO_INO(handle); 223 if (dp->i_number == ep_ino) { 224 VN_HOLD(dvp); /* want ourself, "." 
*/ 225 *ipp = dp; 226 rw_exit(&dp->i_rwlock); 227 return (0); 228 } 229 if (namep[0] == '.' && namep[1] == '.' && namep[2] == 0) { 230 uint64_t handle2; 231 /* 232 * release the lock on the dir we are searching 233 * to avoid a deadlock when grabbing the 234 * i_contents lock in ufs_iget_alloced(). 235 */ 236 rw_exit(&dp->i_rwlock); 237 rw_enter(&dp->i_ufsvfs->vfs_dqrwlock, RW_READER); 238 err = ufs_iget_alloced(dp->i_vfs, ep_ino, ipp, cr); 239 rw_exit(&dp->i_ufsvfs->vfs_dqrwlock); 240 /* 241 * must recheck as we dropped dp->i_rwlock 242 */ 243 rw_enter(&dp->i_rwlock, RW_READER); 244 if (!err && (dnlc_dir_lookup(dcap, namep, &handle2) 245 == DFOUND) && (handle == handle2)) { 246 dnlc_update(dvp, namep, ITOV(*ipp)); 247 rw_exit(&dp->i_rwlock); 248 return (0); 249 } 250 /* check failed, read the actual directory */ 251 if (!err) { 252 VN_RELE(ITOV(*ipp)); 253 } 254 goto restart; 255 } 256 /* usual case of not "." nor ".." */ 257 rw_enter(&dp->i_ufsvfs->vfs_dqrwlock, RW_READER); 258 err = ufs_iget_alloced(dp->i_vfs, ep_ino, ipp, cr); 259 rw_exit(&dp->i_ufsvfs->vfs_dqrwlock); 260 if (err) { 261 rw_exit(&dp->i_rwlock); 262 return (err); 263 } 264 dnlc_update(dvp, namep, ITOV(*ipp)); 265 rw_exit(&dp->i_rwlock); 266 return (0); 267 case DNOENT: 268 if (ufs_negative_cache && (dp->i_nlink > 0)) { 269 dnlc_enter(dvp, namep, DNLC_NO_VNODE); 270 } 271 rw_exit(&dp->i_rwlock); 272 return (ENOENT); 273 default: 274 break; 275 } 276 restart: 277 278 fbp = NULL; 279 doingchk = 0; 280 chkino = 0; 281 caching = 0; 282 283 /* 284 * Attempt to cache any directories greater than 285 * the tunable ufs_min_cache_dir. 
286 */ 287 if ((dp->i_size >= ufs_min_dir_cache) && (dp->i_cachedir)) { 288 switch (dnlc_dir_start(dcap, dp->i_size >> AV_DIRECT_SHIFT)) { 289 case DNOMEM: 290 case DTOOBIG: 291 dp->i_cachedir = 0; 292 break; 293 case DOK: 294 caching = 1; 295 break; 296 default: 297 break; 298 } 299 } 300 /* 301 * If caching we don't stop when the file has been 302 * found, but need to know later, so clear *ipp now 303 */ 304 *ipp = NULL; 305 306 recheck: 307 if (caching) { 308 offset = 0; 309 entryoffsetinblock = 0; 310 numdirpasses = 1; 311 } else { 312 /* 313 * Take care to look at dp->i_diroff only once, as it 314 * may be changing due to other threads/cpus. 315 */ 316 offset = dp->i_diroff; 317 if (offset > dp->i_size) { 318 offset = 0; 319 } 320 if (offset == 0) { 321 entryoffsetinblock = 0; 322 numdirpasses = 1; 323 } else { 324 start_off = offset; 325 326 entryoffsetinblock = blkoff(dp->i_fs, offset); 327 if (entryoffsetinblock != 0) { 328 err = blkatoff(dp, offset, (char **)0, &fbp); 329 if (err) 330 goto bad; 331 } 332 numdirpasses = 2; 333 } 334 } 335 endsearch = P2ROUNDUP_TYPED(dp->i_size, DIRBLKSIZ, u_offset_t); 336 namlen = strlen(namep); 337 last_offset = 0; 338 339 searchloop: 340 while (offset < endsearch) { 341 /* 342 * If offset is on a block boundary, 343 * read the next directory block. 344 * Release previous if it exists. 345 */ 346 if (blkoff(dp->i_fs, offset) == 0) { 347 if (fbp != NULL) { 348 fbrelse(fbp, S_OTHER); 349 } 350 err = blkatoff(dp, offset, (char **)0, &fbp); 351 if (err) 352 goto bad; 353 entryoffsetinblock = 0; 354 } 355 356 /* 357 * If the offset to the next entry is invalid or if the 358 * next entry is a zero length record or if the record 359 * length is invalid, then skip to the next directory 360 * block. Complete validation checks are done if the 361 * record length is invalid. 362 * 363 * Full validation checks are slow so they are disabled 364 * by default. Complete checks can be run by patching 365 * "dirchk" to be true. 
366 * 367 * We have to check the validity of entryoffsetinblock 368 * here because it can be set to i_diroff above. 369 */ 370 ep = (struct direct *)(fbp->fb_addr + entryoffsetinblock); 371 if ((entryoffsetinblock & 0x3) || ep->d_reclen == 0 || 372 (dirchk || (ep->d_reclen & 0x3)) && 373 dirmangled(dp, ep, entryoffsetinblock, offset)) { 374 i = DIRBLKSIZ - (entryoffsetinblock & (DIRBLKSIZ - 1)); 375 offset += i; 376 entryoffsetinblock += i; 377 if (caching) { 378 dnlc_dir_purge(dcap); 379 caching = 0; 380 } 381 continue; 382 } 383 384 ep_reclen = ep->d_reclen; 385 386 /* 387 * Add named entries and free space into the directory cache 388 */ 389 if (caching) { 390 ushort_t extra; 391 off_t off2; 392 393 if (ep->d_ino == 0) { 394 extra = ep_reclen; 395 if (offset & (DIRBLKSIZ - 1)) { 396 dnlc_dir_purge(dcap); 397 dp->i_cachedir = 0; 398 caching = 0; 399 } 400 } else { 401 /* 402 * entries hold the previous offset except the 403 * 1st which holds the offset + 1 404 */ 405 if (offset & (DIRBLKSIZ - 1)) { 406 off2 = last_offset; 407 } else { 408 off2 = offset + 1; 409 } 410 caching = (dnlc_dir_add_entry(dcap, ep->d_name, 411 INO_OFF_TO_H(ep->d_ino, off2)) == DOK); 412 extra = ep_reclen - DIRSIZ(ep); 413 } 414 if (caching && (extra >= LDIRSIZ(1))) { 415 caching = (dnlc_dir_add_space(dcap, extra, 416 (uint64_t)offset) == DOK); 417 } 418 } 419 420 /* 421 * Check for a name match. 422 * We have the parent inode read locked with i_rwlock. 423 */ 424 if (ep->d_ino && ep->d_namlen == namlen && 425 *namep == *ep->d_name && /* fast chk 1st chr */ 426 bcmp(namep, ep->d_name, (int)ep->d_namlen) == 0) { 427 428 /* 429 * We have to release the fbp early here to avoid 430 * a possible deadlock situation where we have the 431 * fbp and want the directory inode and someone doing 432 * a ufs_direnter_* has the directory inode and wants 433 * the fbp. XXX - is this still needed? 
434 */ 435 ep_ino = (ino_t)ep->d_ino; 436 ASSERT(fbp != NULL); 437 fbrelse(fbp, S_OTHER); 438 fbp = NULL; 439 440 /* 441 * Atomic update (read lock held) 442 */ 443 dp->i_diroff = offset; 444 445 if (namlen == 2 && namep[0] == '.' && namep[1] == '.') { 446 struct timeval32 omtime; 447 448 if (caching) { 449 dnlc_dir_purge(dcap); 450 caching = 0; 451 } 452 if (doingchk) { 453 /* 454 * if the inumber didn't change 455 * continue with already found inode. 456 */ 457 if (ep_ino == chkino) 458 goto checkok; 459 else { 460 VN_RELE(ITOV(*ipp)); 461 /* *ipp is nulled at restart */ 462 goto restart; 463 } 464 } 465 /* 466 * release the lock on the dir we are searching 467 * to avoid a deadlock when grabbing the 468 * i_contents lock in ufs_iget_alloced(). 469 */ 470 omtime = dp->i_mtime; 471 rw_exit(&dp->i_rwlock); 472 rw_enter(&dp->i_ufsvfs->vfs_dqrwlock, 473 RW_READER); 474 err = ufs_iget_alloced(dp->i_vfs, ep_ino, ipp, 475 cr); 476 rw_exit(&dp->i_ufsvfs->vfs_dqrwlock); 477 rw_enter(&dp->i_rwlock, RW_READER); 478 if (err) 479 goto bad; 480 /* 481 * Since we released the lock on the directory, 482 * we must check that the same inode is still 483 * the ".." entry for this directory. 484 */ 485 /*CSTYLED*/ 486 if (timercmp(&omtime, &dp->i_mtime, !=)) { 487 /* 488 * Modification time changed on the 489 * directory, we must go check if 490 * the inumber changed for ".." 491 */ 492 doingchk = 1; 493 chkino = ep_ino; 494 entryoffsetinblock = 0; 495 if (caching) { 496 /* 497 * Forget directory caching 498 * for this rare case 499 */ 500 dnlc_dir_purge(dcap); 501 caching = 0; 502 } 503 goto recheck; 504 } 505 } else if (dp->i_number == ep_ino) { 506 VN_HOLD(dvp); /* want ourself, "." 
*/ 507 *ipp = dp; 508 if (caching) { 509 dnlc_dir_purge(dcap); 510 caching = 0; 511 } 512 } else { 513 rw_enter(&dp->i_ufsvfs->vfs_dqrwlock, 514 RW_READER); 515 err = ufs_iget_alloced(dp->i_vfs, ep_ino, ipp, 516 cr); 517 rw_exit(&dp->i_ufsvfs->vfs_dqrwlock); 518 if (err) 519 goto bad; 520 } 521 checkok: 522 ASSERT(*ipp); 523 dnlc_update(dvp, namep, ITOV(*ipp)); 524 /* 525 * If we are not caching then just return the entry 526 * otherwise complete loading up the cache 527 */ 528 if (!caching) { 529 rw_exit(&dp->i_rwlock); 530 return (0); 531 } 532 err = blkatoff(dp, offset, (char **)0, &fbp); 533 if (err) 534 goto bad; 535 } 536 last_offset = offset; 537 offset += ep_reclen; 538 entryoffsetinblock += ep_reclen; 539 } 540 /* 541 * If we started in the middle of the directory and failed 542 * to find our target, we must check the beginning as well. 543 */ 544 if (numdirpasses == 2) { 545 numdirpasses--; 546 offset = 0; 547 endsearch = start_off; 548 goto searchloop; 549 } 550 551 /* 552 * If whole directory caching is on (or was originally on) then 553 * the entry may have been found. 554 */ 555 if (*ipp == NULL) { 556 err = ENOENT; 557 if (ufs_negative_cache && (dp->i_nlink > 0)) { 558 dnlc_enter(dvp, namep, DNLC_NO_VNODE); 559 } 560 } 561 if (caching) { 562 dnlc_dir_complete(dcap); 563 caching = 0; 564 } 565 566 bad: 567 if (err && *ipp) { 568 /* 569 * err and *ipp can both be set if we were attempting to 570 * cache the directory, and we found the entry, then later 571 * while trying to complete the directory cache encountered 572 * a error (eg reading a directory sector). 
573 */ 574 VN_RELE(ITOV(*ipp)); 575 *ipp = NULL; 576 } 577 578 if (fbp) 579 fbrelse(fbp, S_OTHER); 580 rw_exit(&dp->i_rwlock); 581 if (caching) 582 dnlc_dir_purge(dcap); 583 return (err); 584 } 585 586 /* 587 * If ufs_dircheckforname() fails to find an entry with the given name, 588 * this "slot" structure holds state for ufs_direnter_*() as to where 589 * there is space to put an entry with that name. 590 * If ufs_dircheckforname() finds an entry with the given name, this structure 591 * holds state for ufs_dirrename() and ufs_dirremove() as to where the 592 * entry is. "status" indicates what ufs_dircheckforname() found: 593 * NONE name not found, large enough free slot not found, 594 * FOUND name not found, large enough free slot found 595 * EXIST name found 596 * If ufs_dircheckforname() fails due to an error, this structure is not 597 * filled in. 598 * 599 * After ufs_dircheckforname() succeeds the values are: 600 * status offset size fbp, ep 601 * ------ ------ ---- ------- 602 * NONE end of dir needed not valid 603 * FOUND start of entry of ent both valid if fbp != NULL 604 * EXIST start of entry of prev ent valid 605 * 606 * "endoff" is set to 0 if the an entry with the given name is found, or if no 607 * free slot could be found or made; this means that the directory should not 608 * be truncated. If the entry was found, the search terminates so 609 * ufs_dircheckforname() didn't find out where the last valid entry in the 610 * directory was, so it doesn't know where to cut the directory off; if no free 611 * slot could be found or made, the directory has to be extended to make room 612 * for the new entry, so there's nothing to cut off. 613 * Otherwise, "endoff" is set to the larger of the offset of the last 614 * non-empty entry in the directory, or the offset at which the new entry will 615 * be placed, whichever is larger. 
This is used by ufs_diraddentry(); if a new 616 * entry is to be added to the directory, any complete directory blocks at the 617 * end of the directory that contain no non-empty entries are lopped off the 618 * end, thus shrinking the directory dynamically. 619 */ 620 typedef enum {NONE, FOUND, EXIST} slotstat_t; 621 struct slot { 622 struct direct *ep; /* pointer to slot */ 623 struct fbuf *fbp; /* dir buf where slot is */ 624 off_t offset; /* offset of area with free space */ 625 off_t endoff; /* last useful location found in search */ 626 slotstat_t status; /* status of slot */ 627 int size; /* size of area at slotoffset */ 628 int cached; /* cached directory */ 629 }; 630 631 632 /* 633 * Write a new directory entry for DE_CREATE or DE_MKDIR operations. 634 */ 635 int 636 ufs_direnter_cm( 637 struct inode *tdp, /* target directory to make entry in */ 638 char *namep, /* name of entry */ 639 enum de_op op, /* entry operation */ 640 struct vattr *vap, /* attributes if new inode needed */ 641 struct inode **ipp, /* return entered inode here */ 642 struct cred *cr, /* user credentials */ 643 int flags) /* no entry exists */ 644 { 645 struct inode *tip; /* inode of (existing) target file */ 646 char *s; 647 struct slot slot; /* slot info to pass around */ 648 int namlen; /* length of name */ 649 int err; /* error number */ 650 struct inode *nip; /* new inode */ 651 int do_rele_nip = 0; /* release nip */ 652 int noentry = flags & ~IQUIET; 653 int quiet = flags & IQUIET; /* Suppress out of inodes message */ 654 655 ASSERT(RW_WRITE_HELD(&tdp->i_rwlock)); 656 657 if (((tdp->i_mode & IFMT) == IFATTRDIR) && ((op == DE_MKDIR) || 658 ((vap->va_type == VCHR) || (vap->va_type == VBLK) || 659 (vap->va_type == VDOOR) || (vap->va_type == VSOCK) || 660 (vap->va_type == VFIFO)))) 661 return (EINVAL); 662 663 /* don't allow '/' characters in pathname component */ 664 for (s = namep, namlen = 0; *s; s++, namlen++) 665 if (*s == '/') 666 return (EACCES); 667 ASSERT(namlen); 668 669 
/* 670 * If name is "." or ".." then if this is a create look it up 671 * and return EEXIST. 672 */ 673 if (namep[0] == '.' && 674 (namlen == 1 || (namlen == 2 && namep[1] == '.'))) { 675 /* 676 * ufs_dirlook will acquire the i_rwlock 677 */ 678 rw_exit(&tdp->i_rwlock); 679 if (err = ufs_dirlook(tdp, namep, ipp, cr, 0)) { 680 rw_enter(&tdp->i_rwlock, RW_WRITER); 681 return (err); 682 } 683 rw_enter(&tdp->i_rwlock, RW_WRITER); 684 return (EEXIST); 685 } 686 687 /* 688 * If target directory has not been removed, then we can consider 689 * allowing file to be created. 690 */ 691 if (tdp->i_nlink <= 0) { 692 return (ENOENT); 693 } 694 695 /* 696 * Check accessibility of directory. 697 */ 698 if (((tdp->i_mode & IFMT) != IFDIR) && 699 ((tdp->i_mode & IFMT) != IFATTRDIR)) { 700 return (ENOTDIR); 701 } 702 703 /* 704 * Execute access is required to search the directory. 705 */ 706 if (err = ufs_iaccess(tdp, IEXEC, cr)) { 707 return (err); 708 } 709 710 /* 711 * Search for the entry. Return VN_HELD tip if found. 712 */ 713 tip = NULL; 714 slot.fbp = NULL; 715 slot.status = NONE; 716 rw_enter(&tdp->i_ufsvfs->vfs_dqrwlock, RW_READER); 717 rw_enter(&tdp->i_contents, RW_WRITER); 718 err = ufs_dircheckforname(tdp, namep, namlen, &slot, &tip, cr, noentry); 719 if (err) 720 goto out; 721 if (tip) { 722 ASSERT(!noentry); 723 *ipp = tip; 724 err = EEXIST; 725 } else { 726 /* 727 * The entry does not exist. Check write permission in 728 * directory to see if entry can be created. 729 */ 730 if (err = ufs_iaccess(tdp, IWRITE, cr)) 731 goto out; 732 /* 733 * Make new inode and directory entry. 734 */ 735 tdp->i_flag |= quiet; 736 if (err = ufs_dirmakeinode(tdp, &nip, vap, op, cr)) { 737 if (nip != NULL) 738 do_rele_nip = 1; 739 goto out; 740 } 741 if (err = ufs_diraddentry(tdp, namep, op, 742 namlen, &slot, nip, NULL, cr)) { 743 /* 744 * Unmake the inode we just made. 
745 */ 746 rw_enter(&nip->i_contents, RW_WRITER); 747 if (((nip->i_mode & IFMT) == IFDIR) || 748 ((nip->i_mode & IFMT) == IFATTRDIR)) { 749 tdp->i_nlink--; 750 ufs_setreclaim(tdp); 751 tdp->i_flag |= ICHG; 752 tdp->i_seq++; 753 TRANS_INODE(tdp->i_ufsvfs, tdp); 754 ITIMES_NOLOCK(tdp); 755 } 756 nip->i_nlink = 0; 757 ufs_setreclaim(nip); 758 TRANS_INODE(nip->i_ufsvfs, nip); 759 nip->i_flag |= ICHG; 760 nip->i_seq++; 761 ITIMES_NOLOCK(nip); 762 rw_exit(&nip->i_contents); 763 do_rele_nip = 1; 764 } else { 765 *ipp = nip; 766 } 767 } 768 769 out: 770 if (slot.fbp) 771 fbrelse(slot.fbp, S_OTHER); 772 773 tdp->i_flag &= ~quiet; 774 rw_exit(&tdp->i_contents); 775 776 /* 777 * Drop vfs_dqrwlock before calling VN_RELE() on nip to 778 * avoid deadlock since ufs_delete() grabs vfs_dqrwlock as reader. 779 */ 780 rw_exit(&tdp->i_ufsvfs->vfs_dqrwlock); 781 782 if (do_rele_nip) { 783 VN_RELE(ITOV(nip)); 784 } 785 786 return (err); 787 } 788 789 /* 790 * Write a new directory entry for DE_LINK, DE_SYMLINK or DE_RENAME operations. 791 * If tvpp is non-null, return with the pointer to the target vnode. 792 */ 793 int 794 ufs_direnter_lr( 795 struct inode *tdp, /* target directory to make entry in */ 796 char *namep, /* name of entry */ 797 enum de_op op, /* entry operation */ 798 struct inode *sdp, /* source inode parent if rename */ 799 struct inode *sip, /* source inode */ 800 struct cred *cr, /* user credentials */ 801 vnode_t **tvpp) /* Return: (held) vnode of (existing) target */ 802 { 803 struct inode *tip; /* inode of (existing) target file */ 804 char *s; 805 struct slot slot; /* slot info to pass around */ 806 int namlen; /* length of name */ 807 int err; /* error number */ 808 809 /* don't allow '/' characters in pathname component */ 810 for (s = namep, namlen = 0; *s; s++, namlen++) 811 if (*s == '/') 812 return (EACCES); 813 ASSERT(namlen); 814 ASSERT(RW_WRITE_HELD(&tdp->i_rwlock)); 815 816 /* 817 * If name is "." or ".." 
then if this is a create look it up 818 * and return EEXIST. Rename or link TO "." or ".." is forbidden. 819 */ 820 if (namep[0] == '.' && 821 (namlen == 1 || (namlen == 2 && namep[1] == '.'))) { 822 if (op == DE_RENAME) { 823 return (EINVAL); /* *SIGH* should be ENOTEMPTY */ 824 } 825 return (EEXIST); 826 } 827 /* 828 * For link and rename lock the source entry and check the link count 829 * to see if it has been removed while it was unlocked. If not, we 830 * increment the link count and force the inode to disk to make sure 831 * that it is there before any directory entry that points to it. 832 * 833 * In the case of a symbolic link, we are dealing with a new inode 834 * which does not yet have any links. We've created it with a link 835 * count of 1, and we don't want to increment it since this will be 836 * its first link. 837 * 838 * We are about to push the inode to disk. We make sure 839 * that the inode's data blocks are flushed first so the 840 * inode and it's data blocks are always in sync. This 841 * adds some robustness in in the event of a power failure 842 * or panic where sync fails. If we panic before the 843 * inode is updated, then the inode still refers to the 844 * old data blocks (or none for a new file). If we panic 845 * after the inode is updated, then the inode refers to 846 * the new data blocks. 847 * 848 * We do this before grabbing the i_contents lock because 849 * ufs_syncip() will want that lock. We could do the data 850 * syncing after the removal checks, but upon return from 851 * the data sync we would have to repeat the removal 852 * checks. 
853 */ 854 if (err = TRANS_SYNCIP(sip, 0, I_DSYNC, TOP_FSYNC)) { 855 return (err); 856 } 857 858 rw_enter(&sip->i_contents, RW_WRITER); 859 if (sip->i_nlink <= 0) { 860 rw_exit(&sip->i_contents); 861 return (ENOENT); 862 } 863 if (sip->i_nlink == MAXLINK) { 864 rw_exit(&sip->i_contents); 865 return (EMLINK); 866 } 867 868 /* 869 * Sync the indirect blocks associated with the file 870 * for the same reasons as described above. Since this 871 * call wants the i_contents lock held for it we can do 872 * this here with no extra work. 873 */ 874 if (err = ufs_sync_indir(sip)) { 875 rw_exit(&sip->i_contents); 876 return (err); 877 } 878 879 if (op != DE_SYMLINK) 880 sip->i_nlink++; 881 TRANS_INODE(sip->i_ufsvfs, sip); 882 sip->i_flag |= ICHG; 883 sip->i_seq++; 884 ufs_iupdat(sip, I_SYNC); 885 rw_exit(&sip->i_contents); 886 887 /* 888 * If target directory has not been removed, then we can consider 889 * allowing file to be created. 890 */ 891 if (tdp->i_nlink <= 0) { 892 err = ENOENT; 893 goto out2; 894 } 895 /* 896 * Check accessibility of directory. 897 */ 898 if (((tdp->i_mode & IFMT) != IFDIR) && 899 (tdp->i_mode & IFMT) != IFATTRDIR) { 900 err = ENOTDIR; 901 goto out2; 902 } 903 /* 904 * Execute access is required to search the directory. 905 */ 906 if (err = ufs_iaccess(tdp, IEXEC, cr)) { 907 goto out2; 908 } 909 910 /* 911 * Search for the entry. Return VN_HELD tip if found. 912 */ 913 tip = NULL; 914 slot.status = NONE; 915 slot.fbp = NULL; 916 rw_enter(&tdp->i_ufsvfs->vfs_dqrwlock, RW_READER); 917 rw_enter(&tdp->i_contents, RW_WRITER); 918 err = ufs_dircheckforname(tdp, namep, namlen, &slot, &tip, cr, 0); 919 if (err) 920 goto out; 921 922 if (tip) { 923 switch (op) { 924 case DE_RENAME: 925 err = ufs_dirrename(sdp, sip, tdp, namep, 926 tip, &slot, cr); 927 break; 928 929 case DE_LINK: 930 case DE_SYMLINK: 931 /* 932 * Can't link to an existing file. 
933 */ 934 err = EEXIST; 935 break; 936 default: 937 break; 938 } 939 } else { 940 /* 941 * The entry does not exist. Check write permission in 942 * directory to see if entry can be created. 943 */ 944 if (err = ufs_iaccess(tdp, IWRITE, cr)) 945 goto out; 946 err = ufs_diraddentry(tdp, namep, op, namlen, &slot, sip, sdp, 947 cr); 948 } 949 950 out: 951 if (slot.fbp) 952 fbrelse(slot.fbp, S_OTHER); 953 954 rw_exit(&tdp->i_contents); 955 956 /* 957 * Drop vfs_dqrwlock before calling VN_RELE() on tip to 958 * avoid deadlock since ufs_delete() grabs vfs_dqrwlock as reader. 959 */ 960 rw_exit(&tdp->i_ufsvfs->vfs_dqrwlock); 961 962 /* 963 * If we renamed a file over the top of an existing file, 964 * or linked a file to an existing file (or tried to), 965 * then set *tvpp to the target vnode, if tvpp is non-null 966 * otherwise, release and delete (or just release) the inode. 967 * 968 * N.B., by returning the target's vnode pointer to the caller, 969 * that caller becomes responsible for doing the VN_RELE. 970 */ 971 if (tip) { 972 if ((err == 0) && (tvpp != NULL)) { 973 *tvpp = ITOV(tip); 974 } else { 975 VN_RELE(ITOV(tip)); 976 } 977 } 978 979 out2: 980 if (err) { 981 /* 982 * Undo bumped link count. 983 */ 984 if (op != DE_SYMLINK) { 985 rw_enter(&sip->i_contents, RW_WRITER); 986 sip->i_nlink--; 987 ufs_setreclaim(sip); 988 TRANS_INODE(sip->i_ufsvfs, sip); 989 sip->i_flag |= ICHG; 990 sip->i_seq++; 991 ITIMES_NOLOCK(sip); 992 rw_exit(&sip->i_contents); 993 } 994 } 995 return (err); 996 } 997 998 /* 999 * Check for the existence of a name in a directory (unless noentry 1000 * is set) , or else of an empty 1001 * slot in which an entry may be made. If the requested name is found, 1002 * then on return *ipp points at the inode and *offp contains 1003 * its offset in the directory. 
If the name is not found, then *ipp
 * will be NULL and *slotp will contain information about a directory slot in
 * which an entry may be made (either an empty slot, or the first position
 * past the end of the directory).
 * The target directory inode (tdp) is supplied write locked (i_rwlock).
 *
 * This may not be used on "." or "..", but aliases of "." are ok.
 */
static int
ufs_dircheckforname(
	struct inode *tdp,	/* inode of directory being checked */
	char *namep,		/* name we're checking for */
	int namlen,		/* length of name, excluding null */
	struct slot *slotp,	/* slot structure */
	struct inode **ipp,	/* return inode if we find one */
	struct cred *cr,
	int noentry)		/* noentry - just look for space */
{
	uint64_t handle;
	struct fbuf *fbp;	/* pointer to directory block */
	struct direct *ep;	/* directory entry */
	struct direct *nep;	/* next directory entry */
	dcanchor_t *dcap;
	vnode_t *dvp;		/* directory vnode ptr */
	off_t dirsize;		/* size of the directory */
	off_t offset;		/* offset in the directory */
	off_t last_offset;	/* last offset */
	off_t enduseful;	/* pointer past last used dir slot */
	int entryoffsetinblk;	/* offset of ep in fbp's buffer */
	int i;			/* length of mangled entry */
	int needed;
	int err;
	int first;
	int caching;		/* non-zero while we are filling the dir cache */
	int stat;
	ino_t ep_ino;
	slotstat_t initstat = slotp->status;	/* status the caller asked for */

	ASSERT(RW_WRITE_HELD(&tdp->i_rwlock));
	ASSERT(RW_WRITE_HELD(&tdp->i_contents));
	ASSERT(*ipp == NULL);
	fbp = NULL;

	/*
	 * First check if there is a complete cache of the directory.
	 */
	dvp = ITOV(tdp);

	dcap = &tdp->i_danchor;
	if (noentry) {
		/*
		 * We know from the 1st level dnlc cache that the entry
		 * doesn't exist, so don't bother searching the directory
		 * cache, but just look for space (possibly in the directory
		 * cache).
		 */
		stat = DNOENT;
	} else {
		stat = dnlc_dir_lookup(dcap, namep, &handle);
	}
	switch (stat) {
	case DFOUND:
		/* The directory cache knows the name: validate and use it. */
		ep_ino = (ino_t)H_TO_INO(handle);
		if (tdp->i_number == ep_ino) {
			*ipp = tdp;	/* we want ourself, ie "." */
			VN_HOLD(dvp);
		} else {
			err = ufs_iget_alloced(tdp->i_vfs, ep_ino, ipp, cr);
			if (err)
				return (err);
		}
		offset = H_TO_OFF(handle);
		first = 0;
		if (offset & 1) {
			/* This is the first entry in the block */
			first = 1;
			offset -= 1;
			ASSERT((offset & (DIRBLKSIZ - 1)) == 0);
		}
		err = blkatoff(tdp, offset, (char **)&ep, &fbp);
		if (err) {
			VN_RELE(ITOV(*ipp));
			*ipp = NULL;
			return (err);
		}
		/*
		 * Check the validity of the entry.
		 * If it's bad, then throw away the cache and
		 * continue without it. The dirmangled() routine
		 * will then be called upon it.
		 * (fbp stays held here; the scan below releases it before
		 * reading its first block.)
		 */
		if ((ep->d_reclen == 0) || (ep->d_reclen & 0x3)) {
			VN_RELE(ITOV(*ipp));
			*ipp = NULL;
			dnlc_dir_purge(dcap);
			break;
		}
		/*
		 * Remember the returned offset is the offset of the
		 * preceding record (unless this is the 1st record
		 * in the DIRBLKSIZ sized block (disk sector)), then it's
		 * offset + 1. Note, no real offsets are on odd boundaries.
		 */
		if (first) {
			ASSERT((offset & (DIRBLKSIZ - 1)) == 0);
			slotp->offset = offset;
			slotp->size = 0;
			slotp->ep = ep;
		} else {
			/* get the next entry */
			nep = (struct direct *)((char *)ep + ep->d_reclen);
			/*
			 * Check the validity of this entry as well
			 * If it's bad, then throw away the cache and
			 * continue without it. The dirmangled() routine
			 * will then be called upon it.
			 */
			if ((nep->d_reclen == 0) || (nep->d_reclen & 0x3) ||
			    (nep->d_ino != ep_ino)) {
				VN_RELE(ITOV(*ipp));
				*ipp = NULL;
				dnlc_dir_purge(dcap);
				break;
			}
			slotp->offset = offset + ep->d_reclen;
			slotp->size = ep->d_reclen;
			slotp->ep = nep;
		}
		slotp->status = EXIST;
		slotp->fbp = fbp;
		slotp->endoff = 0;
		slotp->cached = 1;
		dnlc_update(dvp, namep, ITOV(*ipp));
		return (0);
	case DNOENT:
		/*
		 * The caller gets to set the initial slot status to
		 * indicate whether it's interested in getting a
		 * empty slot. For example, the status can be set
		 * to FOUND when an entry is being deleted.
		 */
		ASSERT(slotp->fbp == NULL);
		if (slotp->status == FOUND) {
			return (0);
		}
		switch (dnlc_dir_rem_space_by_len(dcap, LDIRSIZ(namlen),
		    &handle)) {
		case DFOUND:
			offset = (off_t)handle;
			err = blkatoff(tdp, offset, (char **)&ep, &fbp);
			if (err) {
				dnlc_dir_purge(dcap);
				ASSERT(*ipp == NULL);
				return (err);
			}
			/*
			 * Check the validity of the entry.
			 * If it's bad, then throw away the cache and
			 * continue without it. The dirmangled() routine
			 * will then be called upon it.
			 */
			if ((ep->d_reclen == 0) || (ep->d_reclen & 0x3)) {
				dnlc_dir_purge(dcap);
				break;
			}
			/*
			 * Remember the returned offset is the offset of the
			 * containing record.
			 */
			slotp->status = FOUND;
			slotp->ep = ep;
			slotp->offset = offset;
			slotp->fbp = fbp;
			slotp->size = ep->d_reclen;
			/*
			 * Set end offset to 0. Truncation is handled
			 * because the dnlc cache will blow away the
			 * cached directory when an entry is removed
			 * that drops the entries left to less than half
			 * the minimum number (dnlc_min_dir_cache).
			 */
			slotp->endoff = 0;
			slotp->cached = 1;
			return (0);
		case DNOENT:
			/* No cached free space: new entry goes in a new block */
			slotp->status = NONE;
			slotp->offset = P2ROUNDUP_TYPED(tdp->i_size,
			    DIRBLKSIZ, u_offset_t);
			slotp->size = DIRBLKSIZ;
			slotp->endoff = 0;
			slotp->cached = 1;
			return (0);
		default:
			break;
		}
		break;
	}
	slotp->cached = 0;
	/* caching is an int flag, so clear it with 0 rather than NULL */
	caching = 0;
	if (tdp->i_cachedir && !noentry) {
		/*
		 * Attempt to cache any directories greater than
		 * the tunable ufs_min_cache_dir.
		 */
		if (tdp->i_size >= ufs_min_dir_cache) {
			switch (dnlc_dir_start(dcap,
			    tdp->i_size >> AV_DIRECT_SHIFT)) {
			case DNOMEM:
			case DTOOBIG:
				tdp->i_cachedir = 0;
				break;
			case DOK:
				caching = 1;
				break;
			default:
				break;
			}
		}
	}

	/*
	 * No point in using i_diroff since we must search whole directory
	 */
	dirsize = P2ROUNDUP_TYPED(tdp->i_size, DIRBLKSIZ, u_offset_t);
	enduseful = 0;
	offset = last_offset = 0;
	entryoffsetinblk = 0;
	needed = (int)LDIRSIZ(namlen);
	while (offset < dirsize) {
		/*
		 * If offset is on a block boundary,
		 * read the next directory block.
		 * Release previous if it exists.
		 */
		if (blkoff(tdp->i_fs, offset) == 0) {
			if (fbp != NULL)
				fbrelse(fbp, S_OTHER);

			err = blkatoff(tdp, offset, (char **)0, &fbp);
			if (err) {
				ASSERT(*ipp == NULL);
				if (caching) {
					dnlc_dir_purge(dcap);
				}
				return (err);
			}
			entryoffsetinblk = 0;
		}
		/*
		 * If still looking for a slot, and at a DIRBLKSIZ
		 * boundary, have to start looking for free space
		 * again.
		 */
		if (slotp->status == NONE &&
		    (entryoffsetinblk & (DIRBLKSIZ - 1)) == 0) {
			slotp->offset = -1;
		}
		/*
		 * If the next entry is a zero length record or if the
		 * record length is invalid, then skip to the next
		 * directory block. Complete validation checks are
		 * done if the record length is invalid.
		 *
		 * Full validation checks are slow so they are disabled
		 * by default. Complete checks can be run by patching
		 * "dirchk" to be true.
		 *
		 * We do not have to check the validity of
		 * entryoffsetinblk here because it starts out as zero
		 * and is only incremented by d_reclen values that we
		 * validate here.
		 */
		ep = (struct direct *)(fbp->fb_addr + entryoffsetinblk);
		if (ep->d_reclen == 0 ||
		    (dirchk || (ep->d_reclen & 0x3)) &&
		    dirmangled(tdp, ep, entryoffsetinblk, offset)) {
			i = DIRBLKSIZ - (entryoffsetinblk & (DIRBLKSIZ - 1));
			offset += i;
			entryoffsetinblk += i;
			if (caching) {
				dnlc_dir_purge(dcap);
				caching = 0;
			}
			continue;
		}

		/*
		 * Add named entries and free space into the directory cache
		 */
		if (caching) {
			ushort_t extra;
			off_t off2;

			if (ep->d_ino == 0) {
				extra = ep->d_reclen;
				/* an unused entry that is not first in its block */
				if (offset & (DIRBLKSIZ - 1)) {
					dnlc_dir_purge(dcap);
					caching = 0;
				}
			} else {
				/*
				 * entries hold the previous offset if
				 * not the 1st one
				 */
				if (offset & (DIRBLKSIZ - 1)) {
					off2 = last_offset;
				} else {
					off2 = offset + 1;
				}
				caching = (dnlc_dir_add_entry(dcap, ep->d_name,
				    INO_OFF_TO_H(ep->d_ino, off2)) == DOK);
				extra = ep->d_reclen - DIRSIZ(ep);
			}
			if (caching && (extra >= LDIRSIZ(1))) {
				caching = (dnlc_dir_add_space(dcap, extra,
				    (uint64_t)offset) == DOK);
			}
		}

		/*
		 * If an appropriate sized slot has not yet been found,
		 * check to see if one is available.
		 */
		if ((slotp->status != FOUND) && (slotp->status != EXIST)) {
			int size = ep->d_reclen;

			if (ep->d_ino != 0)
				size -= DIRSIZ(ep);
			if (size > 0) {
				if (size >= needed) {
					slotp->offset = offset;
					slotp->size = ep->d_reclen;
					if (noentry) {
						/* caller only wanted space */
						slotp->ep = ep;
						slotp->fbp = fbp;
						slotp->status = FOUND;
						slotp->endoff = 0;
						return (0);
					}
					slotp->status = FOUND;
				} else if (slotp->status == NONE) {
					if (slotp->offset == -1)
						slotp->offset = offset;
				}
			}
		}
		/*
		 * Check for a name match.
		 */
		if (ep->d_ino && ep->d_namlen == namlen &&
		    *namep == *ep->d_name &&	/* fast chk 1st char */
		    bcmp(namep, ep->d_name, namlen) == 0) {

			tdp->i_diroff = offset;

			if (tdp->i_number == ep->d_ino) {
				*ipp = tdp;	/* we want ourself, ie "." */
				VN_HOLD(dvp);
			} else {
				err = ufs_iget_alloced(tdp->i_vfs,
				    (ino_t)ep->d_ino, ipp, cr);
				if (err) {
					fbrelse(fbp, S_OTHER);
					if (caching)
						dnlc_dir_purge(dcap);
					return (err);
				}
			}
			slotp->status = EXIST;
			slotp->offset = offset;
			slotp->size = (int)(offset - last_offset);
			slotp->fbp = fbp;
			slotp->ep = ep;
			slotp->endoff = 0;
			/* the scan stopped early, so the cache is incomplete */
			if (caching)
				dnlc_dir_purge(dcap);
			return (0);
		}
		last_offset = offset;
		offset += ep->d_reclen;
		entryoffsetinblk += ep->d_reclen;
		if (ep->d_ino)
			enduseful = offset;
	}
	if (fbp) {
		fbrelse(fbp, S_OTHER);
	}

	if (caching) {
		dnlc_dir_complete(dcap);
		slotp->cached = 1;
		if (slotp->status == FOUND) {
			if (initstat == FOUND) {
				return (0);
			}
			(void) dnlc_dir_rem_space_by_handle(dcap,
			    slotp->offset);
			slotp->endoff = 0;
			return (0);
		}
	}

	if (slotp->status == NONE) {
		/*
		 * We didn't find a slot; the new directory entry should be put
		 * at the end of the directory.
Return an indication of where 1409 * this is, and set "endoff" to zero; since we're going to have 1410 * to extend the directory, we're certainly not going to 1411 * truncate it. 1412 */ 1413 slotp->offset = dirsize; 1414 slotp->size = DIRBLKSIZ; 1415 slotp->endoff = 0; 1416 } else { 1417 /* 1418 * We found a slot, and will return an indication of where that 1419 * slot is, as any new directory entry will be put there. 1420 * Since that slot will become a useful entry, if the last 1421 * useful entry we found was before this one, update the offset 1422 * of the last useful entry. 1423 */ 1424 if (enduseful < slotp->offset + slotp->size) 1425 enduseful = slotp->offset + slotp->size; 1426 slotp->endoff = P2ROUNDUP_TYPED(enduseful, DIRBLKSIZ, off_t); 1427 } 1428 *ipp = NULL; 1429 return (0); 1430 } 1431 1432 uint64_t ufs_dirrename_retry_cnt; 1433 1434 /* 1435 * Rename the entry in the directory tdp so that it points to 1436 * sip instead of tip. 1437 */ 1438 static int 1439 ufs_dirrename( 1440 struct inode *sdp, /* parent directory of source */ 1441 struct inode *sip, /* source inode */ 1442 struct inode *tdp, /* parent directory of target */ 1443 char *namep, /* entry we are trying to change */ 1444 struct inode *tip, /* target inode */ 1445 struct slot *slotp, /* slot for entry */ 1446 struct cred *cr) /* credentials */ 1447 { 1448 vnode_t *tdvp; 1449 off_t offset; 1450 int err; 1451 int doingdirectory; 1452 1453 ASSERT(sdp->i_ufsvfs != NULL); 1454 ASSERT(RW_WRITE_HELD(&tdp->i_rwlock)); 1455 ASSERT(RW_WRITE_HELD(&tdp->i_contents)); 1456 /* 1457 * Short circuit rename of something to itself. 1458 */ 1459 if (sip->i_number == tip->i_number) { 1460 return (ESAME); /* special KLUDGE error code */ 1461 } 1462 1463 /* 1464 * We're locking 2 peer level locks, so must use tryenter 1465 * on the 2nd to avoid deadlocks that would occur 1466 * if we renamed a->b and b->a concurrently. 
1467 */ 1468 retry: 1469 rw_enter(&tip->i_contents, RW_WRITER); 1470 if (!rw_tryenter(&sip->i_contents, RW_READER)) { 1471 /* 1472 * drop tip and wait (sleep) until we stand a chance 1473 * of holding sip 1474 */ 1475 rw_exit(&tip->i_contents); 1476 rw_enter(&sip->i_contents, RW_READER); 1477 /* 1478 * Reverse the lock grabs in case we have heavy 1479 * contention on the 2nd lock. 1480 */ 1481 if (!rw_tryenter(&tip->i_contents, RW_WRITER)) { 1482 ufs_dirrename_retry_cnt++; 1483 rw_exit(&sip->i_contents); 1484 goto retry; 1485 } 1486 } 1487 1488 /* 1489 * Check that everything is on the same filesystem. 1490 */ 1491 if ((ITOV(tip)->v_vfsp != ITOV(tdp)->v_vfsp) || 1492 (ITOV(tip)->v_vfsp != ITOV(sip)->v_vfsp)) { 1493 err = EXDEV; /* XXX archaic */ 1494 goto out; 1495 } 1496 /* 1497 * Must have write permission to rewrite target entry. 1498 * Perform additional checks for sticky directories. 1499 */ 1500 if ((err = ufs_iaccess(tdp, IWRITE, cr)) != 0 || 1501 (err = ufs_sticky_remove_access(tdp, tip, cr)) != 0) 1502 goto out; 1503 1504 /* 1505 * Ensure source and target are compatible (both directories 1506 * or both not directories). If target is a directory it must 1507 * be empty and have no links to it; in addition it must not 1508 * be a mount point, and both the source and target must be 1509 * writable. 1510 */ 1511 doingdirectory = (((sip->i_mode & IFMT) == IFDIR) || 1512 ((sip->i_mode & IFMT) == IFATTRDIR)); 1513 if (((tip->i_mode & IFMT) == IFDIR) || 1514 ((tip->i_mode & IFMT) == IFATTRDIR)) { 1515 if (!doingdirectory) { 1516 err = EISDIR; 1517 goto out; 1518 } 1519 /* 1520 * vn_vfslock will prevent mounts from using the directory until 1521 * we are done. 
1522 */ 1523 if (vn_vfslock(ITOV(tip))) { 1524 err = EBUSY; 1525 goto out; 1526 } 1527 if (vn_mountedvfs(ITOV(tip)) != NULL) { 1528 vn_vfsunlock(ITOV(tip)); 1529 err = EBUSY; 1530 goto out; 1531 } 1532 if (!ufs_dirempty(tip, tdp->i_number, cr) || tip->i_nlink > 2) { 1533 vn_vfsunlock(ITOV(tip)); 1534 err = EEXIST; /* SIGH should be ENOTEMPTY */ 1535 goto out; 1536 } 1537 } else if (doingdirectory) { 1538 err = ENOTDIR; 1539 goto out; 1540 } 1541 1542 /* 1543 * Rewrite the inode pointer for target name entry 1544 * from the target inode (ip) to the source inode (sip). 1545 * This prevents the target entry from disappearing 1546 * during a crash. Mark the directory inode to reflect the changes. 1547 */ 1548 tdvp = ITOV(tdp); 1549 slotp->ep->d_ino = (int32_t)sip->i_number; 1550 dnlc_update(tdvp, namep, ITOV(sip)); 1551 if (slotp->size) { 1552 offset = slotp->offset - slotp->size; 1553 } else { 1554 offset = slotp->offset + 1; 1555 } 1556 if (slotp->cached) { 1557 (void) dnlc_dir_update(&tdp->i_danchor, namep, 1558 INO_OFF_TO_H(slotp->ep->d_ino, offset)); 1559 } 1560 1561 err = TRANS_DIR(tdp, slotp->offset); 1562 if (err) 1563 fbrelse(slotp->fbp, S_OTHER); 1564 else 1565 err = ufs_fbwrite(slotp->fbp, tdp); 1566 1567 slotp->fbp = NULL; 1568 if (err) { 1569 if (doingdirectory) 1570 vn_vfsunlock(ITOV(tip)); 1571 goto out; 1572 } 1573 1574 TRANS_INODE(tdp->i_ufsvfs, tdp); 1575 tdp->i_flag |= IUPD|ICHG; 1576 tdp->i_seq++; 1577 ITIMES_NOLOCK(tdp); 1578 1579 /* 1580 * Decrement the link count of the target inode. 1581 * Fix the ".." entry in sip to point to dp. 1582 * This is done after the new entry is on the disk. 1583 */ 1584 tip->i_nlink--; 1585 TRANS_INODE(tip->i_ufsvfs, tip); 1586 tip->i_flag |= ICHG; 1587 tip->i_seq++; 1588 ITIMES_NOLOCK(tip); 1589 if (doingdirectory) { 1590 /* 1591 * The entry for tip no longer exists so I can unlock the 1592 * vfslock. 1593 */ 1594 vn_vfsunlock(ITOV(tip)); 1595 /* 1596 * Decrement target link count once more if it was a directory. 
1597 */ 1598 if (--tip->i_nlink != 0) { 1599 err = ufs_fault(ITOV(tip), 1600 "ufs_dirrename: target directory link count != 0 (%s)", 1601 tip->i_fs->fs_fsmnt); 1602 rw_exit(&tip->i_contents); 1603 return (err); 1604 } 1605 TRANS_INODE(tip->i_ufsvfs, tip); 1606 ufs_setreclaim(tip); 1607 /* 1608 * Renaming a directory with the parent different 1609 * requires that ".." be rewritten. The window is 1610 * still there for ".." to be inconsistent, but this 1611 * is unavoidable, and a lot shorter than when it was 1612 * done in a user process. We decrement the link 1613 * count in the new parent as appropriate to reflect 1614 * the just-removed target. If the parent is the 1615 * same, this is appropriate since the original 1616 * directory is going away. If the new parent is 1617 * different, ufs_dirfixdotdot() will bump the link count 1618 * back. 1619 */ 1620 tdp->i_nlink--; 1621 ufs_setreclaim(tdp); 1622 TRANS_INODE(tdp->i_ufsvfs, tdp); 1623 tdp->i_flag |= ICHG; 1624 tdp->i_seq++; 1625 ITIMES_NOLOCK(tdp); 1626 if (sdp != tdp) { 1627 rw_exit(&tip->i_contents); 1628 rw_exit(&sip->i_contents); 1629 err = ufs_dirfixdotdot(sip, sdp, tdp); 1630 return (err); 1631 } 1632 } else 1633 ufs_setreclaim(tip); 1634 out: 1635 rw_exit(&tip->i_contents); 1636 rw_exit(&sip->i_contents); 1637 return (err); 1638 } 1639 1640 /* 1641 * Fix the ".." entry of the child directory so that it points 1642 * to the new parent directory instead of the old one. Routine 1643 * assumes that dp is a directory and that all the inodes are on 1644 * the same file system. 
1645 */ 1646 static int 1647 ufs_dirfixdotdot( 1648 struct inode *dp, /* child directory */ 1649 struct inode *opdp, /* old parent directory */ 1650 struct inode *npdp) /* new parent directory */ 1651 { 1652 struct fbuf *fbp; 1653 struct dirtemplate *dirp; 1654 vnode_t *dvp; 1655 int err; 1656 1657 ASSERT(RW_WRITE_HELD(&npdp->i_rwlock)); 1658 ASSERT(RW_WRITE_HELD(&npdp->i_contents)); 1659 1660 /* 1661 * We hold the child directory's i_contents lock before calling 1662 * blkatoff so that we honor correct locking protocol which is 1663 * i_contents lock and then page lock. (blkatoff will call 1664 * ufs_getpage where we want the page lock) 1665 * We hold the child directory's i_rwlock before i_contents (as 1666 * per the locking protocol) since we are modifying the ".." entry 1667 * of the child directory. 1668 * We hold the i_rwlock and i_contents lock until we record 1669 * this directory delta to the log (via ufs_trans_dir) and have 1670 * done fbrelse. 1671 */ 1672 rw_enter(&dp->i_rwlock, RW_WRITER); 1673 rw_enter(&dp->i_contents, RW_WRITER); 1674 err = blkatoff(dp, (off_t)0, (char **)&dirp, &fbp); 1675 if (err) 1676 goto bad; 1677 1678 if (dp->i_nlink <= 0 || 1679 dp->i_size < sizeof (struct dirtemplate)) { 1680 err = ENOENT; 1681 goto bad; 1682 } 1683 1684 if (dirp->dotdot_namlen != 2 || 1685 dirp->dotdot_name[0] != '.' || 1686 dirp->dotdot_name[1] != '.') { /* Sanity check. */ 1687 dirbad(dp, "mangled .. entry", (off_t)0); 1688 err = ENOTDIR; 1689 goto bad; 1690 } 1691 1692 /* 1693 * Increment the link count in the new parent inode and force it out. 1694 */ 1695 if (npdp->i_nlink == MAXLINK) { 1696 err = EMLINK; 1697 goto bad; 1698 } 1699 npdp->i_nlink++; 1700 TRANS_INODE(npdp->i_ufsvfs, npdp); 1701 npdp->i_flag |= ICHG; 1702 npdp->i_seq++; 1703 ufs_iupdat(npdp, I_SYNC); 1704 1705 /* 1706 * Rewrite the child ".." entry and force it out. 
1707 */ 1708 dvp = ITOV(dp); 1709 dirp->dotdot_ino = (uint32_t)npdp->i_number; 1710 dnlc_update(dvp, "..", ITOV(npdp)); 1711 (void) dnlc_dir_update(&dp->i_danchor, "..", 1712 INO_OFF_TO_H(dirp->dotdot_ino, 0)); 1713 1714 err = TRANS_DIR(dp, 0); 1715 if (err) 1716 fbrelse(fbp, S_OTHER); 1717 else 1718 err = ufs_fbwrite(fbp, dp); 1719 1720 fbp = NULL; 1721 if (err) 1722 goto bad; 1723 1724 rw_exit(&dp->i_contents); 1725 rw_exit(&dp->i_rwlock); 1726 1727 /* 1728 * Decrement the link count of the old parent inode and force it out. 1729 */ 1730 ASSERT(opdp); 1731 rw_enter(&opdp->i_contents, RW_WRITER); 1732 ASSERT(opdp->i_nlink > 0); 1733 opdp->i_nlink--; 1734 ufs_setreclaim(opdp); 1735 TRANS_INODE(opdp->i_ufsvfs, opdp); 1736 opdp->i_flag |= ICHG; 1737 opdp->i_seq++; 1738 ufs_iupdat(opdp, I_SYNC); 1739 rw_exit(&opdp->i_contents); 1740 return (0); 1741 1742 bad: 1743 if (fbp) 1744 fbrelse(fbp, S_OTHER); 1745 rw_exit(&dp->i_contents); 1746 rw_exit(&dp->i_rwlock); 1747 return (err); 1748 } 1749 1750 /* 1751 * Enter the file sip in the directory tdp with name namep. 1752 */ 1753 static int 1754 ufs_diraddentry( 1755 struct inode *tdp, 1756 char *namep, 1757 enum de_op op, 1758 int namlen, 1759 struct slot *slotp, 1760 struct inode *sip, 1761 struct inode *sdp, 1762 struct cred *cr) 1763 { 1764 struct direct *ep, *nep; 1765 vnode_t *tdvp; 1766 dcanchor_t *dcap = &tdp->i_danchor; 1767 off_t offset; 1768 int err; 1769 ushort_t extra; 1770 1771 ASSERT(RW_WRITE_HELD(&tdp->i_rwlock)); 1772 ASSERT(RW_WRITE_HELD(&tdp->i_contents)); 1773 /* 1774 * Prepare a new entry. If the caller has not supplied an 1775 * existing inode, make a new one. 1776 */ 1777 err = dirprepareentry(tdp, slotp, cr); 1778 if (err) { 1779 if (slotp->fbp) { 1780 fbrelse(slotp->fbp, S_OTHER); 1781 slotp->fbp = NULL; 1782 } 1783 return (err); 1784 } 1785 /* 1786 * Check inode to be linked to see if it is in the 1787 * same filesystem. 
1788 */ 1789 if (ITOV(tdp)->v_vfsp != ITOV(sip)->v_vfsp) { 1790 err = EXDEV; 1791 goto bad; 1792 } 1793 1794 /* 1795 * If renaming a directory then fix up the ".." entry in the 1796 * directory to point to the new parent. 1797 */ 1798 if ((op == DE_RENAME) && (((sip->i_mode & IFMT) == IFDIR) || 1799 ((sip->i_mode & IFMT) == IFATTRDIR)) && (sdp != tdp)) { 1800 err = ufs_dirfixdotdot(sip, sdp, tdp); 1801 if (err) 1802 goto bad; 1803 } 1804 1805 /* 1806 * Fill in entry data. 1807 */ 1808 ep = slotp->ep; 1809 ep->d_namlen = (ushort_t)namlen; 1810 (void) strncpy(ep->d_name, namep, (size_t)((namlen + 4) & ~3)); 1811 ep->d_ino = (uint32_t)sip->i_number; 1812 tdvp = ITOV(tdp); 1813 dnlc_update(tdvp, namep, ITOV(sip)); 1814 /* 1815 * Note the offset supplied for any named entry is 1816 * the offset of the previous one, unless it's the 1st. 1817 * slotp->size is used to pass the length to 1818 * the previous entry. 1819 */ 1820 if (slotp->size) { 1821 offset = slotp->offset - slotp->size; 1822 } else { 1823 offset = slotp->offset + 1; 1824 } 1825 1826 if (slotp->cached) { 1827 /* 1828 * Add back any usable unused space to the dnlc directory 1829 * cache. 1830 */ 1831 extra = ep->d_reclen - DIRSIZ(ep); 1832 if (extra >= LDIRSIZ(1)) { 1833 (void) dnlc_dir_add_space(dcap, extra, 1834 (uint64_t)slotp->offset); 1835 } 1836 1837 (void) dnlc_dir_add_entry(dcap, namep, 1838 INO_OFF_TO_H(ep->d_ino, offset)); 1839 1840 /* adjust the previous offset of the next entry */ 1841 nep = (struct direct *)((char *)ep + ep->d_reclen); 1842 if ((uintptr_t)nep & (DIRBLKSIZ - 1)) { 1843 /* 1844 * Not a new block. 1845 * 1846 * Check the validity of the next entry. 1847 * If it's bad, then throw away the cache, and 1848 * continue as before directory caching. 
1849 */ 1850 if ((nep->d_reclen == 0) || (nep->d_reclen & 0x3) || 1851 dnlc_dir_update(dcap, nep->d_name, 1852 INO_OFF_TO_H(nep->d_ino, slotp->offset)) 1853 == DNOENT) { 1854 dnlc_dir_purge(dcap); 1855 slotp->cached = 0; 1856 } 1857 } 1858 } 1859 1860 /* 1861 * Write out the directory block. 1862 */ 1863 err = TRANS_DIR(tdp, slotp->offset); 1864 if (err) 1865 fbrelse(slotp->fbp, S_OTHER); 1866 else 1867 err = ufs_fbwrite(slotp->fbp, tdp); 1868 1869 slotp->fbp = NULL; 1870 /* 1871 * If this is a rename of a directory, then we have already 1872 * fixed the ".." entry to refer to the new parent. If err 1873 * is true at this point, we have failed to update the new 1874 * parent to refer to the renamed directory. 1875 * XXX - we need to unwind the ".." fix. 1876 */ 1877 if (err) 1878 return (err); 1879 1880 /* 1881 * Mark the directory inode to reflect the changes. 1882 * Truncate the directory to chop off blocks of empty entries. 1883 */ 1884 1885 TRANS_INODE(tdp->i_ufsvfs, tdp); 1886 tdp->i_flag |= IUPD|ICHG; 1887 tdp->i_seq++; 1888 tdp->i_diroff = 0; 1889 ITIMES_NOLOCK(tdp); 1890 /* 1891 * If the directory grew then dirprepareentry() will have 1892 * set IATTCHG in tdp->i_flag, then the directory inode must 1893 * be flushed out. This is because if fsync() is used later 1894 * the directory size must be correct, otherwise a crash would 1895 * cause fsck to move the file to lost+found. Also because later 1896 * a file may be linked in more than one directory, then there 1897 * is no way to flush the original directory. So it must be 1898 * flushed out on creation. See bug 4293809. 
1899 */ 1900 if (tdp->i_flag & IATTCHG) { 1901 ufs_iupdat(tdp, I_SYNC); 1902 } 1903 1904 if (slotp->endoff && (slotp->endoff < tdp->i_size)) { 1905 if (!TRANS_ISTRANS(tdp->i_ufsvfs)) { 1906 (void) ufs_itrunc(tdp, (u_offset_t)slotp->endoff, 0, 1907 cr); 1908 } 1909 } 1910 1911 1912 return (0); 1913 1914 bad: 1915 if (slotp->cached) { 1916 dnlc_dir_purge(dcap); 1917 fbrelse(slotp->fbp, S_OTHER); 1918 slotp->cached = 0; 1919 slotp->fbp = NULL; 1920 return (err); 1921 } 1922 1923 /* 1924 * Clear out entry prepared by dirprepareent. 1925 */ 1926 slotp->ep->d_ino = 0; 1927 slotp->ep->d_namlen = 0; 1928 1929 /* 1930 * Don't touch err so we don't clobber the real error that got us here. 1931 */ 1932 if (TRANS_DIR(tdp, slotp->offset)) 1933 fbrelse(slotp->fbp, S_OTHER); 1934 else 1935 (void) ufs_fbwrite(slotp->fbp, tdp); 1936 slotp->fbp = NULL; 1937 return (err); 1938 } 1939 1940 /* 1941 * Prepare a directory slot to receive an entry. 1942 */ 1943 static int 1944 dirprepareentry( 1945 struct inode *dp, /* directory we are working in */ 1946 struct slot *slotp, /* available slot info */ 1947 struct cred *cr) 1948 { 1949 struct direct *ep, *nep; 1950 off_t entryend; 1951 int err; 1952 slotstat_t status = slotp->status; 1953 ushort_t dsize; 1954 1955 ASSERT((status == NONE) || (status == FOUND)); 1956 ASSERT(RW_WRITE_HELD(&dp->i_rwlock)); 1957 ASSERT(RW_WRITE_HELD(&dp->i_contents)); 1958 /* 1959 * If we didn't find a slot, then indicate that the 1960 * new slot belongs at the end of the directory. 1961 * If we found a slot, then the new entry can be 1962 * put at slotp->offset. 
1963 */ 1964 entryend = slotp->offset + slotp->size; 1965 if (status == NONE) { 1966 ASSERT((slotp->offset & (DIRBLKSIZ - 1)) == 0); 1967 if (DIRBLKSIZ > dp->i_fs->fs_fsize) { 1968 err = ufs_fault(ITOV(dp), 1969 "dirprepareentry: bad fs_fsize, DIRBLKSIZ: %d" 1970 " > dp->i_fs->fs_fsize: %d (%s)", 1971 DIRBLKSIZ, dp->i_fs->fs_fsize, dp->i_fs->fs_fsmnt); 1972 return (err); 1973 } 1974 /* 1975 * Allocate the new block. 1976 */ 1977 err = BMAPALLOC(dp, (u_offset_t)slotp->offset, 1978 (int)(blkoff(dp->i_fs, slotp->offset) + DIRBLKSIZ), cr); 1979 if (err) { 1980 return (err); 1981 } 1982 dp->i_size = entryend; 1983 TRANS_INODE(dp->i_ufsvfs, dp); 1984 dp->i_flag |= IUPD|ICHG|IATTCHG; 1985 dp->i_seq++; 1986 ITIMES_NOLOCK(dp); 1987 } else if (entryend > dp->i_size) { 1988 /* 1989 * Adjust directory size, if needed. This should never 1990 * push the size past a new multiple of DIRBLKSIZ. 1991 * This is an artifact of the old (4.2BSD) way of initializing 1992 * directory sizes to be less than DIRBLKSIZ. 1993 */ 1994 dp->i_size = P2ROUNDUP_TYPED(entryend, DIRBLKSIZ, off_t); 1995 TRANS_INODE(dp->i_ufsvfs, dp); 1996 dp->i_flag |= IUPD|ICHG|IATTCHG; 1997 dp->i_seq++; 1998 ITIMES_NOLOCK(dp); 1999 } 2000 2001 /* 2002 * Get the block containing the space for the new directory entry. 2003 */ 2004 if (slotp->fbp == NULL) { 2005 err = blkatoff(dp, slotp->offset, (char **)&slotp->ep, 2006 &slotp->fbp); 2007 if (err) { 2008 return (err); 2009 } 2010 } 2011 ep = slotp->ep; 2012 2013 switch (status) { 2014 case NONE: 2015 /* 2016 * No space in the directory. slotp->offset will be on a 2017 * directory block boundary and we will write the new entry 2018 * into a fresh block. 2019 */ 2020 ep->d_reclen = DIRBLKSIZ; 2021 slotp->size = 0; /* length of previous entry */ 2022 break; 2023 case FOUND: 2024 /* 2025 * An entry of the required size has been found. Use it. 
2026 */ 2027 if (ep->d_ino == 0) { 2028 /* this is the 1st record in a block */ 2029 slotp->size = 0; /* length of previous entry */ 2030 } else { 2031 dsize = DIRSIZ(ep); 2032 nep = (struct direct *)((char *)ep + dsize); 2033 nep->d_reclen = ep->d_reclen - dsize; 2034 ep->d_reclen = dsize; 2035 slotp->ep = nep; 2036 slotp->offset += dsize; 2037 slotp->size = dsize; /* length of previous entry */ 2038 } 2039 break; 2040 default: 2041 break; 2042 } 2043 return (0); 2044 } 2045 2046 /* 2047 * Allocate and initialize a new inode that will go into directory tdp. 2048 * This routine is called from ufs_symlink(), as well as within this file. 2049 */ 2050 int 2051 ufs_dirmakeinode( 2052 struct inode *tdp, 2053 struct inode **ipp, 2054 struct vattr *vap, 2055 enum de_op op, 2056 struct cred *cr) 2057 { 2058 struct inode *ip; 2059 enum vtype type; 2060 int imode; /* mode and format as in inode */ 2061 ino_t ipref; 2062 int err; 2063 timestruc_t now; 2064 2065 ASSERT(vap != NULL); 2066 ASSERT(op == DE_CREATE || op == DE_MKDIR || op == DE_ATTRDIR || 2067 op == DE_SYMLINK); 2068 ASSERT((vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE)); 2069 ASSERT(RW_WRITE_HELD(&tdp->i_rwlock)); 2070 ASSERT(RW_WRITE_HELD(&tdp->i_contents)); 2071 /* 2072 * Allocate a new inode. 2073 */ 2074 type = vap->va_type; 2075 if (type == VDIR) { 2076 ipref = dirpref(tdp); 2077 } else { 2078 ipref = tdp->i_number; 2079 } 2080 if (op == DE_ATTRDIR) 2081 imode = vap->va_mode; 2082 else 2083 imode = MAKEIMODE(type, vap->va_mode); 2084 *ipp = NULL; 2085 err = ufs_ialloc(tdp, ipref, imode, &ip, cr); 2086 if (err) 2087 return (err); 2088 2089 /* 2090 * We don't need to grab vfs_dqrwlock here because it is held 2091 * in ufs_direnter_*() above us. 
2092 */ 2093 ASSERT(RW_READ_HELD(&ip->i_ufsvfs->vfs_dqrwlock)); 2094 rw_enter(&ip->i_contents, RW_WRITER); 2095 if (ip->i_dquot != NULL) { 2096 err = ufs_fault(ITOV(ip), 2097 "ufs_dirmakeinode, ip->i_dquot != NULL: dquot (%s)", 2098 tdp->i_fs->fs_fsmnt); 2099 rw_exit(&ip->i_contents); 2100 return (err); 2101 } 2102 *ipp = ip; 2103 ip->i_mode = (o_mode_t)imode; 2104 if (type == VBLK || type == VCHR) { 2105 dev_t d = vap->va_rdev; 2106 dev32_t dev32; 2107 2108 /* 2109 * Don't allow a special file to be created with a 2110 * dev_t that cannot be represented by this filesystem 2111 * format on disk. 2112 */ 2113 if (!cmpldev(&dev32, d)) { 2114 err = EOVERFLOW; 2115 goto fail; 2116 } 2117 2118 ITOV(ip)->v_rdev = ip->i_rdev = d; 2119 2120 if (dev32 & ~((O_MAXMAJ << L_BITSMINOR32) | O_MAXMIN)) { 2121 ip->i_ordev = dev32; /* can't use old format */ 2122 } else { 2123 ip->i_ordev = cmpdev(d); 2124 } 2125 } 2126 ITOV(ip)->v_type = type; 2127 ufs_reset_vnode(ip->i_vnode); 2128 if (type == VDIR) { 2129 ip->i_nlink = 2; /* anticipating a call to dirmakedirect */ 2130 } else { 2131 ip->i_nlink = 1; 2132 } 2133 2134 if (op == DE_ATTRDIR) { 2135 ip->i_uid = vap->va_uid; 2136 ip->i_gid = vap->va_gid; 2137 } else 2138 ip->i_uid = crgetuid(cr); 2139 /* 2140 * To determine the group-id of the created file: 2141 * 1) If the gid is set in the attribute list (non-Sun & pre-4.0 2142 * clients are not likely to set the gid), then use it if 2143 * the process is privileged, belongs to the target group, 2144 * or the group is the same as the parent directory. 2145 * 2) If the filesystem was not mounted with the Old-BSD-compatible 2146 * GRPID option, and the directory's set-gid bit is clear, 2147 * then use the process's gid. 2148 * 3) Otherwise, set the group-id to the gid of the parent directory. 
2149 */ 2150 if (op != DE_ATTRDIR && (vap->va_mask & AT_GID) && 2151 ((vap->va_gid == tdp->i_gid) || groupmember(vap->va_gid, cr) || 2152 secpolicy_vnode_create_gid(cr) == 0)) { 2153 /* 2154 * XXX - is this only the case when a 4.0 NFS client, or a 2155 * client derived from that code, makes a call over the wire? 2156 */ 2157 ip->i_gid = vap->va_gid; 2158 } else 2159 ip->i_gid = (tdp->i_mode & ISGID) ? tdp->i_gid : crgetgid(cr); 2160 2161 /* 2162 * For SunOS 5.0->5.4, the lines below read: 2163 * 2164 * ip->i_suid = (ip->i_uid > MAXUID) ? UID_LONG : ip->i_uid; 2165 * ip->i_sgid = (ip->i_gid > MAXUID) ? GID_LONG : ip->i_gid; 2166 * 2167 * where MAXUID was set to 60002. See notes on this in ufs_inode.c 2168 */ 2169 ip->i_suid = (ulong_t)ip->i_uid > (ulong_t)USHRT_MAX ? 2170 UID_LONG : ip->i_uid; 2171 ip->i_sgid = (ulong_t)ip->i_gid > (ulong_t)USHRT_MAX ? 2172 GID_LONG : ip->i_gid; 2173 2174 /* 2175 * If we're creating a directory, and the parent directory has the 2176 * set-GID bit set, set it on the new directory. 2177 * Otherwise, if the user is neither privileged nor a member of the 2178 * file's new group, clear the file's set-GID bit. 2179 */ 2180 if ((tdp->i_mode & ISGID) && (type == VDIR)) 2181 ip->i_mode |= ISGID; 2182 else { 2183 if ((ip->i_mode & ISGID) && 2184 secpolicy_vnode_setids_setgids(cr, ip->i_gid) != 0) 2185 ip->i_mode &= ~ISGID; 2186 } 2187 2188 if (((vap->va_mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) || 2189 ((vap->va_mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) { 2190 err = EOVERFLOW; 2191 goto fail; 2192 } 2193 2194 /* 2195 * Extended attribute directories are not subject to quotas. 2196 */ 2197 if (op != DE_ATTRDIR) 2198 ip->i_dquot = getinoquota(ip); 2199 else 2200 ip->i_dquot = NULL; 2201 2202 if (op == DE_MKDIR || op == DE_ATTRDIR) { 2203 err = ufs_dirmakedirect(ip, tdp, (op == DE_MKDIR) ? 
0 : 1, cr); 2204 if (err) 2205 goto fail; 2206 } 2207 2208 /* 2209 * generate the shadow inode and attach it to the new object 2210 */ 2211 ASSERT((tdp->i_shadow && tdp->i_ufs_acl) || 2212 (!tdp->i_shadow && !tdp->i_ufs_acl)); 2213 if (tdp->i_shadow && tdp->i_ufs_acl && 2214 (((tdp->i_mode & IFMT) == IFDIR) || 2215 ((tdp->i_mode & IFMT) == IFATTRDIR))) { 2216 err = ufs_si_inherit(ip, tdp, ip->i_mode, cr); 2217 if (err) { 2218 if (op == DE_MKDIR) { 2219 /* 2220 * clean up parent directory 2221 * 2222 * tdp->i_contents already locked from 2223 * ufs_direnter_*() 2224 */ 2225 tdp->i_nlink--; 2226 TRANS_INODE(tdp->i_ufsvfs, tdp); 2227 tdp->i_flag |= ICHG; 2228 tdp->i_seq++; 2229 ufs_iupdat(tdp, I_SYNC); 2230 } 2231 goto fail; 2232 } 2233 } 2234 2235 /* 2236 * If the passed in attributes contain atime and/or mtime 2237 * settings, then use them instead of using the current 2238 * high resolution time. 2239 */ 2240 if (vap->va_mask & (AT_MTIME|AT_ATIME)) { 2241 if (vap->va_mask & AT_ATIME) { 2242 ip->i_atime.tv_sec = vap->va_atime.tv_sec; 2243 ip->i_atime.tv_usec = vap->va_atime.tv_nsec / 1000; 2244 ip->i_flag &= ~IACC; 2245 } else 2246 ip->i_flag |= IACC; 2247 if (vap->va_mask & AT_MTIME) { 2248 ip->i_mtime.tv_sec = vap->va_mtime.tv_sec; 2249 ip->i_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000; 2250 gethrestime(&now); 2251 if (now.tv_sec > TIME32_MAX) { 2252 /* 2253 * In 2038, ctime sticks forever.. 2254 */ 2255 ip->i_ctime.tv_sec = TIME32_MAX; 2256 ip->i_ctime.tv_usec = 0; 2257 } else { 2258 ip->i_ctime.tv_sec = now.tv_sec; 2259 ip->i_ctime.tv_usec = now.tv_nsec / 1000; 2260 } 2261 ip->i_flag &= ~(IUPD|ICHG); 2262 ip->i_flag |= IMODTIME; 2263 } else 2264 ip->i_flag |= IUPD|ICHG; 2265 ip->i_flag |= IMOD; 2266 } else 2267 ip->i_flag |= IACC|IUPD|ICHG; 2268 ip->i_seq++; 2269 2270 /* 2271 * If this is an attribute tag it as one. 
2272 */ 2273 if ((tdp->i_mode & IFMT) == IFATTRDIR) { 2274 ip->i_cflags |= IXATTR; 2275 } 2276 2277 /* 2278 * push inode before it's name appears in a directory 2279 */ 2280 TRANS_INODE(ip->i_ufsvfs, ip); 2281 ufs_iupdat(ip, I_SYNC); 2282 rw_exit(&ip->i_contents); 2283 return (0); 2284 2285 fail: 2286 /* Throw away inode we just allocated. */ 2287 ip->i_nlink = 0; 2288 ufs_setreclaim(ip); 2289 TRANS_INODE(ip->i_ufsvfs, ip); 2290 ip->i_flag |= ICHG; 2291 ip->i_seq++; 2292 ITIMES_NOLOCK(ip); 2293 rw_exit(&ip->i_contents); 2294 return (err); 2295 } 2296 2297 /* 2298 * Write a prototype directory into the empty inode ip, whose parent is dp. 2299 */ 2300 static int 2301 ufs_dirmakedirect( 2302 struct inode *ip, /* new directory */ 2303 struct inode *dp, /* parent directory */ 2304 int attrdir, 2305 struct cred *cr) 2306 { 2307 struct dirtemplate *dirp; 2308 struct fbuf *fbp; 2309 int err; 2310 2311 ASSERT(RW_WRITE_HELD(&ip->i_contents)); 2312 ASSERT(RW_WRITE_HELD(&dp->i_rwlock)); 2313 ASSERT(RW_WRITE_HELD(&dp->i_contents)); 2314 /* 2315 * Allocate space for the directory we're creating. 2316 */ 2317 err = BMAPALLOC(ip, (u_offset_t)0, DIRBLKSIZ, cr); 2318 if (err) 2319 return (err); 2320 if (DIRBLKSIZ > dp->i_fs->fs_fsize) { 2321 err = ufs_fault(ITOV(dp), 2322 "ufs_dirmakedirect: bad fs_fsize, DIRBLKSIZ: %d > dp->i_fs->fs_fsize: %d (%s)", 2323 DIRBLKSIZ, dp->i_fs->fs_fsize, 2324 dp->i_fs->fs_fsmnt); 2325 return (err); 2326 } 2327 ip->i_size = DIRBLKSIZ; 2328 TRANS_INODE(ip->i_ufsvfs, ip); 2329 ip->i_flag |= IUPD|ICHG|IATTCHG; 2330 ip->i_seq++; 2331 ITIMES_NOLOCK(ip); 2332 /* 2333 * Update the tdp link count and write out the change. 2334 * This reflects the ".." entry we'll soon write. 2335 */ 2336 if (dp->i_nlink == MAXLINK) 2337 return (EMLINK); 2338 if (attrdir == 0) 2339 dp->i_nlink++; 2340 TRANS_INODE(dp->i_ufsvfs, dp); 2341 dp->i_flag |= ICHG; 2342 dp->i_seq++; 2343 ufs_iupdat(dp, I_SYNC); 2344 /* 2345 * Initialize directory with "." 2346 * and ".." 
from static template. 2347 * 2348 * Since the parent directory is locked, we don't have to 2349 * worry about anything changing when we drop the write 2350 * lock on (ip). 2351 * 2352 */ 2353 err = fbread(ITOV(ip), (offset_t)0, (uint_t)ip->i_fs->fs_fsize, 2354 S_READ, &fbp); 2355 2356 if (err) { 2357 goto fail; 2358 } 2359 dirp = (struct dirtemplate *)fbp->fb_addr; 2360 /* 2361 * Now initialize the directory we're creating 2362 * with the "." and ".." entries. 2363 */ 2364 *dirp = mastertemplate; /* structure assignment */ 2365 dirp->dot_ino = (uint32_t)ip->i_number; 2366 dirp->dotdot_ino = (uint32_t)dp->i_number; 2367 2368 err = TRANS_DIR(ip, 0); 2369 if (err) { 2370 fbrelse(fbp, S_OTHER); 2371 goto fail; 2372 } 2373 2374 err = ufs_fbwrite(fbp, ip); 2375 if (err) { 2376 goto fail; 2377 } 2378 2379 return (0); 2380 2381 fail: 2382 if (attrdir == 0) 2383 dp->i_nlink--; 2384 TRANS_INODE(dp->i_ufsvfs, dp); 2385 dp->i_flag |= ICHG; 2386 dp->i_seq++; 2387 ufs_iupdat(dp, I_SYNC); 2388 return (err); 2389 } 2390 2391 /* 2392 * Delete a directory entry. If oip is nonzero the entry is checked 2393 * to make sure it still reflects oip. 2394 * 2395 * If vpp is non-null, return the ptr of the (held) vnode associated with 2396 * the removed name. The caller is responsible for doing the VN_RELE(). 2397 */ 2398 int 2399 ufs_dirremove( 2400 struct inode *dp, 2401 char *namep, 2402 struct inode *oip, 2403 struct vnode *cdir, 2404 enum dr_op op, 2405 struct cred *cr, 2406 vnode_t **vpp) /* Return (held) vnode ptr of removed file/dir */ 2407 { 2408 struct direct *ep, *pep, *nep; 2409 struct inode *ip; 2410 vnode_t *dvp, *vp; 2411 struct slot slot; 2412 int namlen; 2413 int err; 2414 int mode; 2415 ushort_t extra; 2416 2417 namlen = (int)strlen(namep); 2418 if (namlen == 0) 2419 return (ufs_fault(ITOV(dp), "ufs_dirremove: namlen == 0")); 2420 /* 2421 * return error when removing . and .. 
2422 */ 2423 if (namep[0] == '.') { 2424 if (namlen == 1) 2425 return (EINVAL); 2426 else if (namlen == 2 && namep[1] == '.') { 2427 return (EEXIST); /* SIGH should be ENOTEMPTY */ 2428 } 2429 } 2430 2431 ASSERT(RW_WRITE_HELD(&dp->i_rwlock)); 2432 /* 2433 * Check accessibility of directory. 2434 */ 2435 retry: 2436 if (((dp->i_mode & IFMT) != IFDIR) && 2437 ((dp->i_mode & IFMT) != IFATTRDIR)) { 2438 return (ENOTDIR); 2439 } 2440 2441 /* 2442 * Execute access is required to search the directory. 2443 * Access for write is interpreted as allowing 2444 * deletion of files in the directory. 2445 */ 2446 if (err = ufs_iaccess(dp, IEXEC|IWRITE, cr)) { 2447 return (err); 2448 } 2449 2450 ip = NULL; 2451 slot.fbp = NULL; 2452 slot.status = FOUND; /* don't need to look for empty slot */ 2453 rw_enter(&dp->i_ufsvfs->vfs_dqrwlock, RW_READER); 2454 rw_enter(&dp->i_contents, RW_WRITER); 2455 err = ufs_dircheckforname(dp, namep, namlen, &slot, &ip, cr, 0); 2456 if (err) 2457 goto out_novfs; 2458 if (ip == NULL) { 2459 err = ENOENT; 2460 goto out_novfs; 2461 } 2462 vp = ITOV(ip); 2463 if (oip && oip != ip) { 2464 err = ENOENT; 2465 goto out_novfs; 2466 } 2467 2468 mode = ip->i_mode & IFMT; 2469 if (mode == IFDIR || mode == IFATTRDIR) { 2470 2471 /* 2472 * vn_vfslock() prevents races between mount and rmdir. 2473 */ 2474 if (vn_vfslock(vp)) { 2475 err = EBUSY; 2476 goto out_novfs; 2477 } 2478 if (vn_mountedvfs(vp) != NULL && op != DR_RENAME) { 2479 err = EBUSY; 2480 goto out; 2481 } 2482 /* 2483 * If we are removing a directory, get a lock on it. 2484 * Taking a writer lock prevents a parallel ufs_dirlook from 2485 * incorrectly entering a negative cache vnode entry in the dnlc 2486 * If the directory is empty, it will stay empty until 2487 * we can remove it. 2488 */ 2489 if (!rw_tryenter(&ip->i_rwlock, RW_WRITER)) { 2490 /* 2491 * It is possible that a thread in rename would have 2492 * acquired this rwlock. To prevent a deadlock we 2493 * do a rw_tryenter. 
If we fail to get the lock 2494 * we drop all the locks we have acquired, wait 2495 * for 2 ticks and reacquire the 2496 * directory's (dp) i_rwlock and try again. 2497 * If we dont drop dp's i_rwlock then we will panic 2498 * with a "Deadlock: cycle in blocking chain" 2499 * since in ufs_dircheckpath we want dp's i_rwlock. 2500 * dp is guaranteed to exist since ufs_dirremove is 2501 * called after a VN_HOLD(dp) has been done. 2502 */ 2503 ufs_dirremove_retry_cnt++; 2504 vn_vfsunlock(vp); 2505 if (slot.fbp) 2506 fbrelse(slot.fbp, S_OTHER); 2507 rw_exit(&dp->i_contents); 2508 rw_exit(&dp->i_ufsvfs->vfs_dqrwlock); 2509 rw_exit(&dp->i_rwlock); 2510 VN_RELE(vp); 2511 delay(2); 2512 rw_enter(&dp->i_rwlock, RW_WRITER); 2513 goto retry; 2514 } 2515 } 2516 rw_enter(&ip->i_contents, RW_READER); 2517 2518 /* 2519 * Now check the restrictions that apply on sticky directories. 2520 */ 2521 if ((err = ufs_sticky_remove_access(dp, ip, cr)) != 0) { 2522 rw_exit(&ip->i_contents); 2523 if (mode == IFDIR || mode == IFATTRDIR) 2524 rw_exit(&ip->i_rwlock); 2525 goto out; 2526 } 2527 2528 if (op == DR_RMDIR) { 2529 /* 2530 * For rmdir(2), some special checks are required. 2531 * (a) Don't remove any alias of the parent (e.g. "."). 2532 * (b) Don't remove the current directory. 2533 * (c) Make sure the entry is (still) a directory. 2534 * (d) Make sure the directory is empty. 2535 */ 2536 2537 if (dp == ip || vp == cdir) 2538 err = EINVAL; 2539 else if (((ip->i_mode & IFMT) != IFDIR) && 2540 ((ip->i_mode & IFMT) != IFATTRDIR)) 2541 err = ENOTDIR; 2542 else if ((ip->i_nlink > 2) || 2543 !ufs_dirempty(ip, dp->i_number, cr)) { 2544 err = EEXIST; /* SIGH should be ENOTEMPTY */ 2545 } 2546 2547 if (err) { 2548 rw_exit(&ip->i_contents); 2549 if (mode == IFDIR || mode == IFATTRDIR) 2550 rw_exit(&ip->i_rwlock); 2551 goto out; 2552 } 2553 } else if (op == DR_REMOVE) { 2554 /* 2555 * unlink(2) requires a different check: allow only 2556 * privileged users to unlink a directory. 
2557 */ 2558 if (vp->v_type == VDIR && 2559 secpolicy_fs_linkdir(cr, vp->v_vfsp)) { 2560 err = EPERM; 2561 rw_exit(&ip->i_contents); 2562 rw_exit(&ip->i_rwlock); 2563 goto out; 2564 } 2565 } 2566 2567 rw_exit(&ip->i_contents); 2568 2569 /* 2570 * Remove the cache'd entry, if any. 2571 */ 2572 dvp = ITOV(dp); 2573 dnlc_remove(dvp, namep); 2574 ep = slot.ep; 2575 ep->d_ino = 0; 2576 2577 if (slot.cached) { 2578 dcanchor_t *dcap = &dp->i_danchor; 2579 2580 (void) dnlc_dir_rem_entry(dcap, namep, NULL); 2581 if (((int)ep->d_reclen - (int)DIRSIZ(ep)) >= LDIRSIZ(1)) { 2582 (void) dnlc_dir_rem_space_by_handle(dcap, slot.offset); 2583 } 2584 if (slot.offset & (DIRBLKSIZ - 1)) { 2585 /* 2586 * Collapse new free space into previous entry. 2587 * Note, the previous entry has already been 2588 * validated in ufs_dircheckforname(). 2589 */ 2590 ASSERT(slot.size); 2591 pep = (struct direct *)((char *)ep - slot.size); 2592 if ((pep->d_ino == 0) && 2593 ((uintptr_t)pep & (DIRBLKSIZ - 1))) { 2594 dnlc_dir_purge(dcap); 2595 slot.cached = 0; 2596 goto nocache; 2597 } 2598 if (pep->d_ino) { 2599 extra = pep->d_reclen - DIRSIZ(pep); 2600 } else { 2601 extra = pep->d_reclen; 2602 } 2603 if (extra >= LDIRSIZ(1)) { 2604 (void) dnlc_dir_rem_space_by_handle(dcap, 2605 (uint64_t)(slot.offset - slot.size)); 2606 } 2607 pep->d_reclen += ep->d_reclen; 2608 (void) dnlc_dir_add_space(dcap, extra + ep->d_reclen, 2609 (uint64_t)(slot.offset - slot.size)); 2610 /* adjust the previous pointer in the next entry */ 2611 nep = (struct direct *)((char *)ep + ep->d_reclen); 2612 if ((uintptr_t)nep & (DIRBLKSIZ - 1)) { 2613 /* 2614 * Not a new block. 2615 * 2616 * Check the validity of the entry. 2617 * If it's bad, then throw away the cache and 2618 * continue. 
2619 */ 2620 if ((nep->d_reclen == 0) || 2621 (nep->d_reclen & 0x3) || 2622 (dnlc_dir_update(dcap, nep->d_name, 2623 INO_OFF_TO_H(nep->d_ino, 2624 slot.offset - slot.size)) == DNOENT)) { 2625 dnlc_dir_purge(dcap); 2626 slot.cached = 0; 2627 } 2628 } 2629 } else { 2630 (void) dnlc_dir_add_space(dcap, ep->d_reclen, 2631 (uint64_t)slot.offset); 2632 } 2633 } else { 2634 /* 2635 * If the entry isn't the first in the directory, we must 2636 * reclaim the space of the now empty record by adding 2637 * the record size to the size of the previous entry. 2638 */ 2639 if (slot.offset & (DIRBLKSIZ - 1)) { 2640 /* 2641 * Collapse new free space into previous entry. 2642 */ 2643 pep = (struct direct *)((char *)ep - slot.size); 2644 pep->d_reclen += ep->d_reclen; 2645 } 2646 } 2647 nocache: 2648 2649 2650 err = TRANS_DIR(dp, slot.offset); 2651 if (err) 2652 fbrelse(slot.fbp, S_OTHER); 2653 else 2654 err = ufs_fbwrite(slot.fbp, dp); 2655 slot.fbp = NULL; 2656 2657 /* 2658 * If we were removing a directory, it is 'gone' now, but we cannot 2659 * unlock it as a thread may be waiting for the lock in ufs_create. If 2660 * we did, it could then create a file in a deleted directory. 2661 */ 2662 2663 if (err) { 2664 if (mode == IFDIR || mode == IFATTRDIR) 2665 rw_exit(&ip->i_rwlock); 2666 goto out; 2667 } 2668 2669 rw_enter(&ip->i_contents, RW_WRITER); 2670 2671 dp->i_flag |= IUPD|ICHG; 2672 dp->i_seq++; 2673 ip->i_flag |= ICHG; 2674 ip->i_seq++; 2675 2676 TRANS_INODE(dp->i_ufsvfs, dp); 2677 TRANS_INODE(ip->i_ufsvfs, ip); 2678 /* 2679 * Now dispose of the inode. 2680 */ 2681 if (ip->i_nlink > 0) { 2682 /* 2683 * This is not done for IFATTRDIR's because they don't 2684 * have entries in the dnlc and the link counts are 2685 * not incremented when they are created. 2686 */ 2687 if (op == DR_RMDIR && (ip->i_mode & IFMT) == IFDIR) { 2688 /* 2689 * Decrement by 2 because we're trashing the "." 2690 * entry as well as removing the entry in dp. 
2691 * Clear the directory entry, but there may be 2692 * other hard links so don't free the inode. 2693 * Decrement the dp linkcount because we're 2694 * trashing the ".." entry. 2695 */ 2696 ip->i_nlink -= 2; 2697 dp->i_nlink--; 2698 ufs_setreclaim(dp); 2699 /* 2700 * XXX need to discard negative cache entries 2701 * for vp. See comment in ufs_delete(). 2702 */ 2703 dnlc_remove(vp, "."); 2704 dnlc_remove(vp, ".."); 2705 /* 2706 * The return value is ignored here bacause if 2707 * the directory purge fails we don't want to 2708 * stop the delete. If ufs_dirpurgedotdot fails 2709 * the delete will continue with the preexiting 2710 * behavior. 2711 */ 2712 (void) ufs_dirpurgedotdot(ip, dp->i_number, cr); 2713 } else { 2714 ip->i_nlink--; 2715 } 2716 ufs_setreclaim(ip); 2717 } 2718 ITIMES_NOLOCK(dp); 2719 ITIMES_NOLOCK(ip); 2720 2721 if (!TRANS_ISTRANS(dp->i_ufsvfs)) 2722 ufs_iupdat(dp, I_SYNC); 2723 if (!TRANS_ISTRANS(ip->i_ufsvfs)) 2724 ufs_iupdat(ip, I_SYNC); 2725 2726 rw_exit(&ip->i_contents); 2727 if (mode == IFDIR || mode == IFATTRDIR) 2728 rw_exit(&ip->i_rwlock); 2729 out: 2730 if (mode == IFDIR || mode == IFATTRDIR) { 2731 vn_vfsunlock(vp); 2732 } 2733 out_novfs: 2734 ASSERT(RW_WRITE_HELD(&dp->i_contents)); 2735 2736 if (slot.fbp) 2737 fbrelse(slot.fbp, S_OTHER); 2738 2739 rw_exit(&dp->i_contents); 2740 rw_exit(&dp->i_ufsvfs->vfs_dqrwlock); 2741 2742 /* 2743 * If no error and vpp is non-NULL, return the vnode ptr to the caller. 2744 * The caller becomes responsible for the VN_RELE(). Otherwise, 2745 * Release (and delete) the inode after we drop vfs_dqrwlock to 2746 * avoid deadlock since ufs_delete() grabs vfs_dqrwlock as reader. 2747 */ 2748 if (ip) { 2749 if ((err == 0) && (vpp != NULL)) { 2750 *vpp = ITOV(ip); 2751 } else { 2752 VN_RELE(vp); 2753 } 2754 } 2755 2756 return (err); 2757 } 2758 2759 /* 2760 * Return buffer with contents of block "offset" 2761 * from the beginning of directory "ip". 
If "res"
 * is non-zero, fill it in with a pointer to the
 * remaining space in the directory.
 *
 */

int
blkatoff(
	struct inode *ip,
	off_t offset,
	char **res,
	struct fbuf **fbpp)
{
	struct fs *fs = ip->i_fs;
	struct fbuf *fbp;
	daddr_t lbn;
	uint_t bsize;
	int err;

	CPU_STATS_ADD_K(sys, ufsdirblk, 1);

	/* Map the whole file system block that contains "offset". */
	lbn = (daddr_t)lblkno(fs, offset);
	bsize = (uint_t)blksize(fs, ip, lbn);
	err = fbread(ITOV(ip), (offset_t)(offset & fs->fs_bmask),
	    bsize, S_READ, &fbp);
	if (err != 0) {
		/* Nothing is held on failure; hand back a NULL fbuf. */
		*fbpp = (struct fbuf *)NULL;
		return (err);
	}

	/* Optionally point the caller at "offset" inside the mapping. */
	if (res != NULL)
		*res = fbp->fb_addr + blkoff(fs, offset);
	*fbpp = fbp;
	return (0);
}

/*
 * Do consistency checking:
 *	record length must be multiple of 4
 *	entry must fit in rest of its DIRBLKSIZ block
 *	record must be large enough to contain entry
 *	name is not longer than MAXNAMLEN
 *	name must be as long as advertised, and null terminated
 * NOTE: record length must not be zero (should be checked previously).
 *	This routine is only called if dirchk is true.
 *	It would be nice to set the FSBAD flag in the super-block when
 *	this routine fails so that a fsck is forced on next reboot,
 *	but locking is a problem.
2808 */ 2809 static int 2810 dirmangled( 2811 struct inode *dp, 2812 struct direct *ep, 2813 int entryoffsetinblock, 2814 off_t offset) 2815 { 2816 int i; 2817 2818 i = DIRBLKSIZ - (entryoffsetinblock & (DIRBLKSIZ - 1)); 2819 if ((ep->d_reclen & 0x3) != 0 || (int)ep->d_reclen > i || 2820 (uint_t)ep->d_reclen < DIRSIZ(ep) || ep->d_namlen > MAXNAMLEN || 2821 ep->d_ino && dirbadname(ep->d_name, (int)ep->d_namlen)) { 2822 dirbad(dp, "mangled entry", offset); 2823 return (1); 2824 } 2825 return (0); 2826 } 2827 2828 static void 2829 dirbad(struct inode *ip, char *how, off_t offset) 2830 { 2831 cmn_err(CE_NOTE, "%s: bad dir ino %d at offset %ld: %s", 2832 ip->i_fs->fs_fsmnt, (int)ip->i_number, offset, how); 2833 } 2834 2835 static int 2836 dirbadname(char *sp, int l) 2837 { 2838 while (l--) { /* check for nulls */ 2839 if (*sp++ == '\0') { 2840 return (1); 2841 } 2842 } 2843 return (*sp); /* check for terminating null */ 2844 } 2845 2846 /* 2847 * Check if a directory is empty or not. 2848 */ 2849 static int 2850 ufs_dirempty( 2851 struct inode *ip, 2852 ino_t parentino, 2853 struct cred *cr) 2854 { 2855 return (ufs_dirscan(ip, parentino, cr, 0)); 2856 } 2857 2858 /* 2859 * clear the .. directory entry. 2860 */ 2861 static int 2862 ufs_dirpurgedotdot( 2863 struct inode *ip, 2864 ino_t parentino, 2865 struct cred *cr) 2866 { 2867 return (ufs_dirscan(ip, parentino, cr, 1)); 2868 } 2869 2870 /* 2871 * Scan the directoy. If clr_dotdot is true clear the .. 2872 * directory else check to see if the directory is empty. 2873 * 2874 * Using a struct dirtemplate here is not precisely 2875 * what we want, but better than using a struct direct. 2876 * 2877 * clr_dotdot is used as a flag to tell us if we need 2878 * to clear the dotdot entry 2879 * 2880 * N.B.: does not handle corrupted directories. 
2881 */ 2882 static int 2883 ufs_dirscan( 2884 struct inode *ip, 2885 ino_t parentino, 2886 struct cred *cr, 2887 int clr_dotdot) 2888 { 2889 offset_t off; 2890 struct dirtemplate dbuf; 2891 struct direct *dp = (struct direct *)&dbuf; 2892 int err, count; 2893 int empty = 1; /* Assume it's empty */ 2894 #define MINDIRSIZ (sizeof (struct dirtemplate) / 2) 2895 2896 ASSERT(RW_LOCK_HELD(&ip->i_contents)); 2897 2898 ASSERT(ip->i_size <= (offset_t)MAXOFF_T); 2899 for (off = 0; off < ip->i_size; off += dp->d_reclen) { 2900 err = ufs_rdwri(UIO_READ, FREAD, ip, (caddr_t)dp, 2901 (ssize_t)MINDIRSIZ, off, UIO_SYSSPACE, &count, cr); 2902 /* 2903 * Since we read MINDIRSIZ, residual must 2904 * be 0 unless we're at end of file. 2905 */ 2906 if (err || count != 0 || dp->d_reclen == 0) { 2907 empty = 0; 2908 break; 2909 } 2910 /* skip empty entries */ 2911 if (dp->d_ino == 0) 2912 continue; 2913 /* accept only "." and ".." */ 2914 if (dp->d_namlen > 2 || dp->d_name[0] != '.') { 2915 empty = 0; 2916 break; 2917 } 2918 /* 2919 * At this point d_namlen must be 1 or 2. 2920 * 1 implies ".", 2 implies ".." if second 2921 * char is also "." 2922 */ 2923 if (dp->d_namlen == 1) 2924 continue; 2925 if (dp->d_name[1] == '.' && 2926 (ino_t)dp->d_ino == parentino) { 2927 /* 2928 * If we're doing a purge we need to check for 2929 * the . and .. entries and clear the d_ino for .. 2930 * 2931 * if clr_dotdot is set ufs_dirscan does not 2932 * check for an empty directory. 2933 */ 2934 if (clr_dotdot) { 2935 /* 2936 * Have to actually zap the .. 2937 * entry in the directory, as 2938 * otherwise someone might have 2939 * dp as its cwd and try to 2940 * open .., which now points to 2941 * an unallocated inode. 
2942 */ 2943 empty = ufs_dirclrdotdot(ip, parentino); 2944 break; 2945 } else { 2946 continue; 2947 } 2948 } 2949 empty = 0; 2950 break; 2951 } 2952 return (empty); 2953 } 2954 2955 clock_t retry_backoff_delay = 1; /* delay before retrying the i_rwlock */ 2956 uint64_t dircheck_retry_cnt; 2957 /* 2958 * Check if source directory inode is in the path of the target directory. 2959 * Target is supplied locked. 2960 * 2961 * The source and target inode's should be different upon entry. 2962 */ 2963 int 2964 ufs_dircheckpath( 2965 ino_t source_ino, 2966 struct inode *target, 2967 struct inode *sdp, 2968 struct cred *cr) 2969 { 2970 struct fbuf *fbp; 2971 struct dirtemplate *dirp; 2972 struct inode *ip; 2973 struct ufsvfs *ufsvfsp; 2974 struct inode *tip; 2975 ino_t dotdotino; 2976 int err; 2977 2978 ASSERT(target->i_ufsvfs != NULL); 2979 ASSERT(RW_LOCK_HELD(&target->i_rwlock)); 2980 ASSERT(RW_LOCK_HELD(&sdp->i_rwlock)); 2981 2982 ip = target; 2983 if (ip->i_number == source_ino) { 2984 err = EINVAL; 2985 goto out; 2986 } 2987 if (ip->i_number == UFSROOTINO) { 2988 err = 0; 2989 goto out; 2990 } 2991 /* 2992 * Search back through the directory tree, using the ".." entries. 2993 * Fail any attempt to move a directory into an ancestor directory. 2994 */ 2995 fbp = NULL; 2996 for (;;) { 2997 struct vfs *vfs; 2998 2999 err = blkatoff(ip, (off_t)0, (char **)&dirp, &fbp); 3000 if (err) 3001 break; 3002 if (((ip->i_mode & IFMT) != IFDIR) || ip->i_nlink == 0 || 3003 ip->i_size < sizeof (struct dirtemplate)) { 3004 dirbad(ip, "bad size, unlinked or not dir", (off_t)0); 3005 err = ENOTDIR; 3006 break; 3007 } 3008 if (dirp->dotdot_namlen != 2 || 3009 dirp->dotdot_name[0] != '.' || 3010 dirp->dotdot_name[1] != '.') { 3011 dirbad(ip, "mangled .. 
entry", (off_t)0); 3012 err = ENOTDIR; /* Sanity check */ 3013 break; 3014 } 3015 dotdotino = (ino_t)dirp->dotdot_ino; 3016 if (dotdotino == source_ino) { 3017 err = EINVAL; 3018 break; 3019 } 3020 if (dotdotino == UFSROOTINO) 3021 break; 3022 if (fbp) { 3023 fbrelse(fbp, S_OTHER); 3024 fbp = NULL; 3025 } 3026 vfs = ip->i_vfs; 3027 ufsvfsp = ip->i_ufsvfs; 3028 3029 if (ip != target) { 3030 rw_exit(&ip->i_rwlock); 3031 VN_RELE(ITOV(ip)); 3032 } 3033 /* 3034 * Race to get the inode. 3035 */ 3036 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER); 3037 if (err = ufs_iget_alloced(vfs, dotdotino, &tip, cr)) { 3038 rw_exit(&ufsvfsp->vfs_dqrwlock); 3039 ip = NULL; 3040 break; 3041 } 3042 rw_exit(&ufsvfsp->vfs_dqrwlock); 3043 /* 3044 * If the directory of the source inode (also a directory) 3045 * is the same as this next entry up the chain, then 3046 * we know the source directory itself can't be in the 3047 * chain. This also prevents a panic because we already 3048 * have sdp->i_rwlock locked. 3049 */ 3050 if (tip == sdp) { 3051 VN_RELE(ITOV(tip)); 3052 ip = NULL; 3053 break; 3054 } 3055 ip = tip; 3056 3057 /* 3058 * If someone has set the WRITE_WANTED bit in this lock and if 3059 * this happens to be a sdp or tdp of another parallel rename 3060 * which is executing the same code and in similar situation 3061 * we end up in a 4 way deadlock. We need to make sure that 3062 * the WRITE_WANTED bit is not set. 3063 */ 3064 retry_lock: 3065 if (!rw_tryenter(&ip->i_rwlock, RW_READER)) { 3066 /* 3067 * If the lock held as WRITER thats fine but if it 3068 * has WRITE_WANTED bit set we might end up in a 3069 * deadlock. If WRITE_WANTED is set we return 3070 * with EAGAIN else we just go back and try. 3071 */ 3072 if (RW_ISWRITER(&ip->i_rwlock) && 3073 !(RW_WRITE_HELD(&ip->i_rwlock))) { 3074 err = EAGAIN; 3075 if (fbp) { 3076 fbrelse(fbp, S_OTHER); 3077 } 3078 VN_RELE(ITOV(ip)); 3079 return (err); 3080 } else { 3081 /* 3082 * The lock is being write held. 
We could 3083 * just do a rw_enter here but there is a 3084 * window between the check and now, where 3085 * the status could have changed, so to 3086 * avoid looping we backoff and go back to 3087 * try for the lock. 3088 */ 3089 delay(retry_backoff_delay); 3090 dircheck_retry_cnt++; 3091 goto retry_lock; 3092 } 3093 } 3094 } 3095 if (fbp) { 3096 fbrelse(fbp, S_OTHER); 3097 } 3098 out: 3099 if (ip) { 3100 if (ip != target) { 3101 rw_exit(&ip->i_rwlock); 3102 VN_RELE(ITOV(ip)); 3103 } 3104 } 3105 return (err); 3106 } 3107 3108 int 3109 ufs_xattrdirempty(struct inode *ip, ino_t parentino, struct cred *cr) 3110 { 3111 offset_t off; 3112 struct dirtemplate dbuf; 3113 struct direct *dp = (struct direct *)&dbuf; 3114 int err, count; 3115 int empty = 1; /* Assume it's empty */ 3116 #define MINDIRSIZ (sizeof (struct dirtemplate) / 2) 3117 3118 ASSERT(RW_LOCK_HELD(&ip->i_contents)); 3119 3120 ASSERT(ip->i_size <= (offset_t)MAXOFF_T); 3121 for (off = 0; off < ip->i_size; off += dp->d_reclen) { 3122 err = ufs_rdwri(UIO_READ, FREAD, ip, (caddr_t)dp, 3123 (ssize_t)MINDIRSIZ, off, UIO_SYSSPACE, &count, cr); 3124 /* 3125 * Since we read MINDIRSIZ, residual must 3126 * be 0 unless we're at end of file. 3127 */ 3128 3129 if (err || count != 0 || dp->d_reclen == 0) { 3130 empty = 0; 3131 break; 3132 } 3133 /* skip empty entries */ 3134 if (dp->d_ino == 0) 3135 continue; 3136 /* 3137 * At this point d_namlen must be 1 or 2. 3138 * 1 implies ".", 2 implies ".." if second 3139 * char is also "." 3140 */ 3141 3142 if (dp->d_namlen == 1 && dp->d_name[0] == '.' && 3143 (ino_t)dp->d_ino == parentino) 3144 continue; 3145 3146 if (dp->d_namlen == 2 && dp->d_name[0] == '.' && 3147 dp->d_name[1] == '.') { 3148 continue; 3149 } 3150 empty = 0; 3151 break; 3152 } 3153 return (empty); 3154 } 3155 3156 3157 /* 3158 * Allocate and initialize a new shadow inode to contain extended attributes. 
3159 */ 3160 int 3161 ufs_xattrmkdir( 3162 struct inode *tdp, 3163 struct inode **ipp, 3164 int flags, 3165 struct cred *cr) 3166 { 3167 struct inode *ip; 3168 struct vattr va; 3169 int err; 3170 int retry = 1; 3171 struct ufsvfs *ufsvfsp; 3172 struct ulockfs *ulp; 3173 int issync; 3174 int trans_size; 3175 int dorwlock; /* 0 = not yet taken, */ 3176 /* 1 = taken outside the transaction, */ 3177 /* 2 = taken inside the transaction */ 3178 3179 /* 3180 * Validate permission to create attribute directory 3181 */ 3182 3183 if ((err = ufs_iaccess(tdp, IWRITE, cr)) != 0) { 3184 return (err); 3185 } 3186 3187 if (vn_is_readonly(ITOV(tdp))) 3188 return (EROFS); 3189 3190 /* 3191 * No need to re-init err after again:, since it's set before 3192 * the next use of it. 3193 */ 3194 again: 3195 dorwlock = 0; 3196 va.va_type = VDIR; 3197 va.va_uid = tdp->i_uid; 3198 va.va_gid = tdp->i_gid; 3199 3200 if ((tdp->i_mode & IFMT) == IFDIR) { 3201 va.va_mode = (o_mode_t)IFATTRDIR; 3202 va.va_mode |= tdp->i_mode & 0777; 3203 } else { 3204 va.va_mode = (o_mode_t)IFATTRDIR|0700; 3205 if (tdp->i_mode & 0040) 3206 va.va_mode |= 0750; 3207 if (tdp->i_mode & 0004) 3208 va.va_mode |= 0705; 3209 } 3210 va.va_mask = AT_TYPE|AT_MODE; 3211 3212 ufsvfsp = tdp->i_ufsvfs; 3213 3214 err = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_MKDIR_MASK); 3215 if (err) 3216 return (err); 3217 3218 /* 3219 * Acquire i_rwlock before TRANS_BEGIN_CSYNC() if this is a file. 3220 * This follows the protocol for read()/write(). 3221 */ 3222 if (ITOV(tdp)->v_type != VDIR) { 3223 rw_enter(&tdp->i_rwlock, RW_WRITER); 3224 dorwlock = 1; 3225 } 3226 3227 if (ulp) { 3228 trans_size = (int)TOP_MKDIR_SIZE(tdp); 3229 TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_MKDIR, trans_size); 3230 } 3231 3232 /* 3233 * Acquire i_rwlock after TRANS_BEGIN_CSYNC() if this is a directory. 3234 * This follows the protocol established by 3235 * ufs_link/create/remove/rename/mkdir/rmdir/symlink. 
3236 */ 3237 if (dorwlock == 0) { 3238 rw_enter(&tdp->i_rwlock, RW_WRITER); 3239 dorwlock = 2; 3240 } 3241 rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER); 3242 rw_enter(&tdp->i_contents, RW_WRITER); 3243 3244 /* 3245 * Suppress out of inodes messages if we will retry. 3246 */ 3247 if (retry) 3248 tdp->i_flag |= IQUIET; 3249 err = ufs_dirmakeinode(tdp, &ip, &va, DE_ATTRDIR, cr); 3250 tdp->i_flag &= ~IQUIET; 3251 3252 if (err) 3253 goto fail; 3254 3255 if (flags) { 3256 3257 /* 3258 * Now attach it to src file. 3259 */ 3260 3261 tdp->i_oeftflag = ip->i_number; 3262 } 3263 3264 ip->i_cflags |= IXATTR; 3265 ITOV(ip)->v_flag |= V_XATTRDIR; 3266 TRANS_INODE(ufsvfsp, tdp); 3267 tdp->i_flag |= ICHG | IUPD; 3268 tdp->i_seq++; 3269 ufs_iupdat(tdp, I_SYNC); 3270 rw_exit(&tdp->i_contents); 3271 rw_exit(&ufsvfsp->vfs_dqrwlock); 3272 3273 rw_enter(&ip->i_rwlock, RW_WRITER); 3274 rw_enter(&ip->i_contents, RW_WRITER); 3275 TRANS_INODE(ufsvfsp, ip); 3276 ip->i_flag |= ICHG| IUPD; 3277 ip->i_seq++; 3278 ufs_iupdat(ip, I_SYNC); 3279 rw_exit(&ip->i_contents); 3280 rw_exit(&ip->i_rwlock); 3281 if (dorwlock == 2) 3282 rw_exit(&tdp->i_rwlock); 3283 if (ulp) { 3284 int terr = 0; 3285 3286 TRANS_END_CSYNC(ufsvfsp, err, issync, TOP_MKDIR, trans_size); 3287 ufs_lockfs_end(ulp); 3288 if (err == 0) 3289 err = terr; 3290 } 3291 if (dorwlock == 1) 3292 rw_exit(&tdp->i_rwlock); 3293 *ipp = ip; 3294 return (err); 3295 3296 fail: 3297 rw_exit(&tdp->i_contents); 3298 rw_exit(&ufsvfsp->vfs_dqrwlock); 3299 if (dorwlock == 2) 3300 rw_exit(&tdp->i_rwlock); 3301 if (ulp) { 3302 TRANS_END_CSYNC(ufsvfsp, err, issync, TOP_MKDIR, trans_size); 3303 ufs_lockfs_end(ulp); 3304 } 3305 if (dorwlock == 1) 3306 rw_exit(&tdp->i_rwlock); 3307 if (ip != NULL) 3308 VN_RELE(ITOV(ip)); 3309 3310 /* 3311 * No inodes? See if any are tied up in pending deletions. 3312 * This has to be done outside of any of the above, because 3313 * the draining operation can't be done from inside a transaction. 
3314 */ 3315 if ((err == ENOSPC) && retry && TRANS_ISTRANS(ufsvfsp)) { 3316 ufs_delete_drain_wait(ufsvfsp, 1); 3317 retry = 0; 3318 goto again; 3319 } 3320 3321 return (err); 3322 } 3323 3324 /* 3325 * clear the dotdot directory entry. 3326 * Used by ufs_dirscan when clr_dotdot 3327 * flag is set and we're deleting a 3328 * directory. 3329 */ 3330 static int 3331 ufs_dirclrdotdot(struct inode *ip, ino_t parentino) 3332 { 3333 struct fbuf *fbp; 3334 struct direct *dotp, *dotdotp; 3335 int err = 0; 3336 3337 ASSERT(RW_WRITE_HELD(&ip->i_rwlock)); 3338 ASSERT(RW_LOCK_HELD(&ip->i_contents)); 3339 err = blkatoff(ip, 0, NULL, &fbp); 3340 if (err) { 3341 return (err); 3342 } 3343 3344 dotp = (struct direct *)fbp->fb_addr; 3345 if ((dotp->d_namlen < (MAXNAMLEN + 1)) && 3346 ((DIRBLKSIZ - DIRSIZ(dotp)) >= (sizeof (struct dirtemplate) / 2))) { 3347 dotdotp = (struct direct *)((char *)dotp + dotp->d_reclen); 3348 if ((dotdotp->d_namlen < (MAXNAMLEN + 1)) && 3349 ((DIRBLKSIZ - DIRSIZ(dotp)) >= dotdotp->d_reclen)) { 3350 3351 dotp->d_reclen += dotdotp->d_reclen; 3352 if (parentino == dotdotp->d_ino) { 3353 dotdotp->d_ino = 0; 3354 dotdotp->d_namlen = 0; 3355 dotdotp->d_reclen = 0; 3356 } 3357 3358 err = TRANS_DIR(ip, 0); 3359 if (err) { 3360 fbrelse(fbp, S_OTHER); 3361 } else { 3362 err = ufs_fbwrite(fbp, ip); 3363 } 3364 } 3365 } else { 3366 err = -1; 3367 } 3368 return (err); 3369 } 3370