/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/uio.h>
#include <sys/bitmap.h>
#include <sys/signal.h>
#include <sys/cred.h>
#include <sys/user.h>
#include <sys/vfs.h>
#include <sys/stat.h>
#include <sys/vnode.h>
#include <sys/buf.h>
#include <sys/proc.h>
#include <sys/disp.h>
#include <sys/dnlc.h>
#include <sys/mode.h>
#include <sys/cmn_err.h>
#include <sys/kstat.h>
#include <sys/acl.h>
#include <sys/var.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_fs.h>
#include <sys/fs/ufs_trans.h>
#include <sys/fs/ufs_acl.h>
#include <sys/fs/ufs_bio.h>
#include <sys/fs/ufs_quota.h>
#include <sys/fs/ufs_log.h>
#include <vm/hat.h>
#include <vm/as.h>
#include <vm/pvn.h>
#include <vm/seg.h>
#include <sys/swap.h>
#include <sys/cpuvar.h>
#include <sys/sysmacros.h>
#include <sys/errno.h>
#include <sys/kmem.h>
#include <sys/debug.h>
#include <fs/fs_subr.h>
#include <sys/policy.h>

struct kmem_cache *inode_cache;		/* cache of free inodes */

/* UFS Inode Cache Stats -- Not protected */
struct instats ins = {
	{ "size",		KSTAT_DATA_ULONG },
	{ "maxsize",		KSTAT_DATA_ULONG },
	{ "hits",		KSTAT_DATA_ULONG },
	{ "misses",		KSTAT_DATA_ULONG },
	{ "kmem allocs",	KSTAT_DATA_ULONG },
	{ "kmem frees",		KSTAT_DATA_ULONG },
	{ "maxsize reached",	KSTAT_DATA_ULONG },
	{ "puts at frontlist",	KSTAT_DATA_ULONG },
	{ "puts at backlist",	KSTAT_DATA_ULONG },
	{ "queues to free",	KSTAT_DATA_ULONG },
	{ "scans",		KSTAT_DATA_ULONG },
	{ "thread idles",	KSTAT_DATA_ULONG },
	{ "lookup idles",	KSTAT_DATA_ULONG },
	{ "vget idles",		KSTAT_DATA_ULONG },
	{ "cache allocs",	KSTAT_DATA_ULONG },
	{ "cache frees",	KSTAT_DATA_ULONG },
	{ "pushes at close",	KSTAT_DATA_ULONG }
};

/* kstat data */
static kstat_t *ufs_inode_kstat = NULL;

union ihead *ihead;	/* inode LRU cache, Chris Maltby */
kmutex_t *ih_lock;	/* protect inode cache hash table */
static int ino_hashlen = 4;	/* desired average hash chain length */
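/*
 * A sizing sketch (illustrative, see ihinit() below): the table is
 * sized as inohsz = 1 << highbit(ufs_ninode / ino_hashlen), a
 * power-of-two bucket count chosen so that hash chains average about
 * ino_hashlen inodes.  INOHASH(ino) then selects a bucket, and the
 * matching ih_lock[] mutex serializes access to that chain.
 */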
int inohsz;	/* number of buckets in the hash table */

kmutex_t ufs_scan_lock;		/* stop racing multiple ufs_scan_inodes() */
kmutex_t ufs_iuniqtime_lock;	/* protect iuniqtime */
kmutex_t ufsvfs_mutex;
struct ufsvfs *oldufsvfslist, *ufsvfslist;

/*
 * Time to wait after ufsvfsp->vfs_iotstamp before declaring that no
 * I/Os are going on.
 */
clock_t ufs_iowait;

/*
 * The threads that process idle inodes and free (deleted) inodes
 * have high water marks that are set in ufsinit().
 * These values can be no less than the minimum shown below.
 */
int ufs_idle_max;	/* # of allowable idle inodes */
ulong_t ufs_inode_max;	/* hard limit of allowable idle inodes */
#define	UFS_IDLE_MAX	(16)	/* min # of allowable idle inodes */

/*
 * Tunables for ufs write throttling.
 * These are validated in ufs_iinit() since improper settings
 * can lead to filesystem hangs.
 */
#define	UFS_HW_DEFAULT	(16 * 1024 * 1024)
#define	UFS_LW_DEFAULT	(8 * 1024 * 1024)
int ufs_HW = UFS_HW_DEFAULT;
int ufs_LW = UFS_LW_DEFAULT;

static void ihinit(void);
extern int hash2ints(int, int);

static int ufs_iget_internal(struct vfs *, ino_t, struct inode **,
    struct cred *, int);

/* ARGSUSED */
static int
ufs_inode_kstat_update(kstat_t *ksp, int rw)
{
	if (rw == KSTAT_WRITE)
		return (EACCES);

	ins.in_malloc.value.ul	= (ulong_t)kmem_cache_stat(inode_cache,
	    "slab_alloc");
	ins.in_mfree.value.ul	= (ulong_t)kmem_cache_stat(inode_cache,
	    "slab_free");
	ins.in_kcalloc.value.ul	= (ulong_t)kmem_cache_stat(inode_cache,
	    "alloc");
	ins.in_kcfree.value.ul	= (ulong_t)kmem_cache_stat(inode_cache,
	    "free");
	ins.in_size.value.ul	= (ulong_t)kmem_cache_stat(inode_cache,
	    "buf_inuse");
	ins.in_maxreached.value.ul = (ulong_t)kmem_cache_stat(inode_cache,
	    "buf_max");
	ins.in_misses.value.ul = ins.in_kcalloc.value.ul;

	return (0);
}

void
ufs_iinit(void)
{
	/*
	 * Validate that ufs_HW > ufs_LW.
	 * The default values for these two tunables have been increased.
	 * There is now a range of ufs_HW values that was legal on previous
	 * Solaris versions but no longer is.
	 * Upgrading a machine which has an /etc/system setting for ufs_HW
	 * from that range can lead to filesystem hangs unless the values
	 * are checked here.
	 */
	if (ufs_HW <= ufs_LW) {
		cmn_err(CE_WARN,
		    "ufs_HW (%d) <= ufs_LW (%d). Check /etc/system.",
		    ufs_HW, ufs_LW);
		ufs_LW = UFS_LW_DEFAULT;
		ufs_HW = UFS_HW_DEFAULT;
		cmn_err(CE_CONT, "using defaults, ufs_HW = %d, ufs_LW = %d\n",
		    ufs_HW, ufs_LW);
	}

	/*
	 * Adjust the tunable `ufs_ninode' to a reasonable value
	 */
	if (ufs_ninode <= 0)
		ufs_ninode = ncsize;
	if (ufs_inode_max == 0)
		ufs_inode_max = (ulong_t)((kmem_maxavail() >> 2) /
		    sizeof (struct inode));
	if (ufs_ninode > ufs_inode_max || (ufs_ninode == 0 && ncsize == 0)) {
		cmn_err(CE_NOTE, "setting ufs_ninode to max value of %ld",
		    ufs_inode_max);
		ufs_ninode = ufs_inode_max;
	}
	/*
	 * Wait till third call of ufs_update to declare that no I/Os are
	 * going on.  This allows deferred access times to be flushed to disk.
	 */
	ufs_iowait = v.v_autoup * hz * 2;
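	/*
	 * Worked example (illustrative values): with the common autoup
	 * default of 30 seconds and hz = 100, ufs_iowait comes out as
	 * 30 * 100 * 2 = 6000 clock ticks, i.e. two whole update cycles
	 * (60 seconds) after vfs_iotstamp.
	 */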
	/*
	 * idle thread runs when 25% of ufs_ninode entries are on the queue
	 */
	if (ufs_idle_max == 0)
		ufs_idle_max = ufs_ninode >> 2;
	if (ufs_idle_max < UFS_IDLE_MAX)
		ufs_idle_max = UFS_IDLE_MAX;
	if (ufs_idle_max > ufs_ninode)
		ufs_idle_max = ufs_ninode;
	/*
	 * This is really a misnomer, it is ufs_queue_init
	 */
	ufs_thread_init(&ufs_idle_q, ufs_idle_max);
	ufs_thread_start(&ufs_idle_q, ufs_thread_idle, NULL);

	/*
	 * global hlock thread
	 */
	ufs_thread_init(&ufs_hlock, 1);
	ufs_thread_start(&ufs_hlock, ufs_thread_hlock, NULL);

	ihinit();
	qtinit();
	ins.in_maxsize.value.ul = ufs_ninode;
	if ((ufs_inode_kstat = kstat_create("ufs", 0, "inode_cache", "ufs",
	    KSTAT_TYPE_NAMED, sizeof (ins) / sizeof (kstat_named_t),
	    KSTAT_FLAG_VIRTUAL)) != NULL) {
		ufs_inode_kstat->ks_data = (void *)&ins;
		ufs_inode_kstat->ks_update = ufs_inode_kstat_update;
		kstat_install(ufs_inode_kstat);
	}
	ufsfx_init();		/* fix-on-panic initialization */
	si_cache_init();
	ufs_directio_init();
	lufs_init();
	mutex_init(&ufs_iuniqtime_lock, NULL, MUTEX_DEFAULT, NULL);
}

/* ARGSUSED */
static int
ufs_inode_cache_constructor(void *buf, void *cdrarg, int kmflags)
{
	struct inode *ip = buf;
	struct vnode *vp;

	rw_init(&ip->i_rwlock, NULL, RW_DEFAULT, NULL);
	rw_init(&ip->i_contents, NULL, RW_DEFAULT, NULL);
	mutex_init(&ip->i_tlock, NULL, MUTEX_DEFAULT, NULL);
	dnlc_dir_init(&ip->i_danchor);

	cv_init(&ip->i_wrcv, NULL, CV_DRIVER, NULL);

	vp = vn_alloc(KM_SLEEP);
	ip->i_vnode = vp;

	vn_setops(vp, ufs_vnodeops);
	vp->v_data = (caddr_t)ip;

	return (0);
}

/* ARGSUSED */
static void
ufs_inode_cache_destructor(void *buf, void *cdrarg)
{
	struct inode *ip = buf;
	struct vnode *vp;

	vp = ITOV(ip);

	rw_destroy(&ip->i_rwlock);
	rw_destroy(&ip->i_contents);

	mutex_destroy(&ip->i_tlock);
	if (vp->v_type == VDIR) {
		dnlc_dir_fini(&ip->i_danchor);
	}

	cv_destroy(&ip->i_wrcv);

	vn_free(vp);
}
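/*
 * Note on the cache lifecycle (a sketch of the kmem object-cache
 * contract, not code from this file): the constructor/destructor above
 * run only when the cache itself grows or shrinks, not on every
 * ufs_alloc_inode()/ufs_free_inode().  A cached object therefore keeps
 * its locks and its vnode across alloc/free cycles; per-allocation
 * state is re-established in ufs_alloc_inode() by explicit assignments
 * and vn_reinit().
 */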
/*
 * Initialize hash links for inodes
 * and build inode free list.
 */
void
ihinit(void)
{
	int i;
	union ihead *ih = ihead;

	mutex_init(&ufs_scan_lock, NULL, MUTEX_DEFAULT, NULL);

	inohsz = 1 << highbit(ufs_ninode / ino_hashlen);
	ihead = kmem_zalloc(inohsz * sizeof (union ihead), KM_SLEEP);
	ih_lock = kmem_zalloc(inohsz * sizeof (kmutex_t), KM_SLEEP);

	for (i = 0, ih = ihead; i < inohsz; i++, ih++) {
		ih->ih_head[0] = ih;
		ih->ih_head[1] = ih;
		mutex_init(&ih_lock[i], NULL, MUTEX_DEFAULT, NULL);
	}
	inode_cache = kmem_cache_create("ufs_inode_cache",
	    sizeof (struct inode), 0, ufs_inode_cache_constructor,
	    ufs_inode_cache_destructor, ufs_inode_cache_reclaim,
	    NULL, NULL, 0);
}

/*
 * Free an inode structure
 */
void
ufs_free_inode(struct inode *ip)
{
	vn_invalid(ITOV(ip));
	kmem_cache_free(inode_cache, ip);
}

/*
 * Allocate an inode structure
 */
struct inode *
ufs_alloc_inode(ufsvfs_t *ufsvfsp, ino_t ino)
{
	struct inode *ip;
	vnode_t *vp;

	ip = kmem_cache_alloc(inode_cache, KM_SLEEP);
	/*
	 * at this point we have a newly allocated inode
	 */
	ip->i_freef = ip;
	ip->i_freeb = ip;
	ip->i_flag = IREF;
	ip->i_seq = 0xFF;	/* Unique initial value */
	ip->i_dev = ufsvfsp->vfs_dev;
	ip->i_ufsvfs = ufsvfsp;
	ip->i_devvp = ufsvfsp->vfs_devvp;
	ip->i_number = ino;
	ip->i_diroff = 0;
	ip->i_nextr = 0;
	ip->i_map = NULL;
	ip->i_rdev = 0;
	ip->i_writes = 0;
	ip->i_mode = 0;
	ip->i_delaylen = 0;
	ip->i_delayoff = 0;
	ip->i_nextrio = 0;
	ip->i_ufs_acl = NULL;
	ip->i_cflags = 0;
	ip->i_mapcnt = 0;
	ip->i_dquot = NULL;
	ip->i_cachedir = 1;
	ip->i_writer = NULL;

	/*
	 * the vnode for this inode was allocated by the constructor
	 */
	vp = ITOV(ip);
	vn_reinit(vp);
	if (ino == (ino_t)UFSROOTINO)
		vp->v_flag = VROOT;
	vp->v_vfsp = ufsvfsp->vfs_vfs;
	vn_exists(vp);
	return (ip);
}

/*
 * Look up an inode by device, inumber.  If it is in core (in the
 * inode structure), honor the locking protocol.  If it is not in
 * core, read it in from the specified device after freeing any pages.
 * In all cases, a pointer to a VN_HELD inode structure is returned.
 */
int
ufs_iget(struct vfs *vfsp, ino_t ino, struct inode **ipp, struct cred *cr)
{
	return (ufs_iget_internal(vfsp, ino, ipp, cr, 0));
}

/*
 * A version of ufs_iget which returns only allocated, linked inodes.
 * This is appropriate for any callers who do not expect a free inode.
 */
int
ufs_iget_alloced(struct vfs *vfsp, ino_t ino, struct inode **ipp,
    struct cred *cr)
{
	return (ufs_iget_internal(vfsp, ino, ipp, cr, 1));
}

/*
 * Set vnode attributes based on v_type; this should be called whenever
 * an inode's i_mode is changed.
 */
void
ufs_reset_vnode(vnode_t *vp)
{
	/*
	 * an old DBE hack
	 */
	if ((VTOI(vp)->i_mode & (ISVTX | IEXEC | IFDIR)) == ISVTX)
		vp->v_flag |= VSWAPLIKE;
	else
		vp->v_flag &= ~VSWAPLIKE;

	/*
	 * if not swap like and it's just a regular file, we want
	 * to maintain the vnode's pages sorted by clean/modified
	 * for faster sync'ing to disk
	 */
	if (vp->v_type == VREG)
		vp->v_flag |= VMODSORT;
	else
		vp->v_flag &= ~VMODSORT;

	/*
	 * Is this an attribute hidden dir?
	 */
	if ((VTOI(vp)->i_mode & IFMT) == IFATTRDIR)
		vp->v_flag |= V_XATTRDIR;
	else
		vp->v_flag &= ~V_XATTRDIR;
}
/*
 * Shared implementation of ufs_iget and ufs_iget_alloced.  The 'validate'
 * flag is used to distinguish the two; when true, we validate that the inode
 * being retrieved looks like a linked and allocated inode.
 */
/* ARGSUSED */
static int
ufs_iget_internal(struct vfs *vfsp, ino_t ino, struct inode **ipp,
    struct cred *cr, int validate)
{
	struct inode *ip, *sp;
	union ihead *ih;
	kmutex_t *ihm;
	struct buf *bp;
	struct dinode *dp;
	struct vnode *vp;
	extern vfs_t EIO_vfs;
	int error;
	int ftype;	/* XXX - Remove later on */
	dev_t vfs_dev;
	struct ufsvfs *ufsvfsp;
	struct fs *fs;
	int hno;
	daddr_t bno;
	ulong_t ioff;

	CPU_STATS_ADD_K(sys, ufsiget, 1);

	/*
	 * Lookup inode in cache.
	 */
	vfs_dev = vfsp->vfs_dev;
	hno = INOHASH(ino);
	ih = &ihead[hno];
	ihm = &ih_lock[hno];

again:
	mutex_enter(ihm);
	for (ip = ih->ih_chain[0]; ip != (struct inode *)ih; ip = ip->i_forw) {
		if (ino != ip->i_number || vfs_dev != ip->i_dev ||
		    (ip->i_flag & ISTALE))
			continue;

		/*
		 * Found the interesting inode; hold it and drop the cache lock
		 */
		vp = ITOV(ip);	/* for locknest */
		VN_HOLD(vp);
		mutex_exit(ihm);
		rw_enter(&ip->i_contents, RW_READER);

		/*
		 * if necessary, remove from idle list
		 */
		if ((ip->i_flag & IREF) == 0) {
			if (ufs_rmidle(ip))
				VN_RELE(vp);
		}

		/*
		 * Could the inode be read from disk?
		 */
		if (ip->i_flag & ISTALE) {
			rw_exit(&ip->i_contents);
			VN_RELE(vp);
			goto again;
		}

		ins.in_hits.value.ul++;
		*ipp = ip;

		/*
		 * Reset the vnode's attribute flags
		 */
		mutex_enter(&vp->v_lock);
		ufs_reset_vnode(vp);
		mutex_exit(&vp->v_lock);

		rw_exit(&ip->i_contents);

		return (0);
	}
	mutex_exit(ihm);

	/*
	 * Inode was not in cache.
	 *
	 * Allocate a new entry
	 */
	ufsvfsp = (struct ufsvfs *)vfsp->vfs_data;
	fs = ufsvfsp->vfs_fs;

	ip = ufs_alloc_inode(ufsvfsp, ino);
	vp = ITOV(ip);

	bno = fsbtodb(fs, itod(fs, ino));
	ioff = (sizeof (struct dinode)) * (itoo(fs, ino));
	ip->i_doff = (offset_t)ioff + ldbtob(bno);
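	/*
	 * A note on the location math above (standard UFS layout macros,
	 * illustrative numbers): itod(fs, ino) gives the filesystem block
	 * holding this dinode and itoo(fs, ino) the dinode's index within
	 * that block.  With 8 KB blocks and 128-byte dinodes, inode 65
	 * sits at index 1 of its block, so ioff == 128 and i_doff is 128
	 * bytes past the block's byte address ldbtob(bno).
	 */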
	/*
	 * put a place holder in the cache (if not already there)
	 */
	mutex_enter(ihm);
	for (sp = ih->ih_chain[0]; sp != (struct inode *)ih; sp = sp->i_forw)
		if (ino == sp->i_number && vfs_dev == sp->i_dev &&
		    ((sp->i_flag & ISTALE) == 0)) {
			mutex_exit(ihm);
			ufs_free_inode(ip);
			goto again;
		}
	/*
	 * It would be nice to ASSERT(RW_READ_HELD(&ufsvfsp->vfs_dqrwlock))
	 * here, but if we do, then shadow inode allocations panic the
	 * system.  We don't have to hold vfs_dqrwlock for shadow inodes
	 * and the ufs_iget() parameters don't tell us what we are getting
	 * so we have no way of knowing this is a ufs_iget() call from
	 * a ufs_ialloc() call for a shadow inode.
	 */
	rw_enter(&ip->i_contents, RW_WRITER);
	insque(ip, ih);
	mutex_exit(ihm);
	/*
	 * read the dinode
	 */
	bp = UFS_BREAD(ufsvfsp, ip->i_dev, bno, (int)fs->fs_bsize);

	/*
	 * Check I/O errors
	 */
	error = ((bp->b_flags & B_ERROR) ? geterror(bp) : 0);
	if (error) {
		brelse(bp);
		ip->i_flag |= ISTALE;	/* in case someone is looking it up */
		rw_exit(&ip->i_contents);
		vp->v_vfsp = &EIO_vfs;
		VN_RELE(vp);
		return (error);
	}
	/*
	 * initialize the inode's dinode
	 */
	dp = (struct dinode *)(ioff + bp->b_un.b_addr);
	ip->i_ic = dp->di_ic;	/* structure assignment */
	brelse(bp);

	/*
	 * Maintain compatibility with Solaris 1.x UFS
	 */
	if (ip->i_suid != UID_LONG)
		ip->i_uid = ip->i_suid;
	if (ip->i_sgid != GID_LONG)
		ip->i_gid = ip->i_sgid;

	ftype = ip->i_mode & IFMT;
	if (ftype == IFBLK || ftype == IFCHR) {
		dev_t dv;
		uint_t top16 = ip->i_ordev & 0xffff0000u;

		if (top16 == 0 || top16 == 0xffff0000u)
			dv = expdev(ip->i_ordev);
		else
			dv = expldev(ip->i_ordev);
		vp->v_rdev = ip->i_rdev = dv;
	}

	/*
	 * if our caller only expects allocated inodes, verify that
	 * this inode looks good; throw it out if it's bad.
	 */
	if (validate) {
		if ((ftype == 0) || (ip->i_nlink <= 0)) {
			ip->i_flag |= ISTALE;
			rw_exit(&ip->i_contents);
			vp->v_vfsp = &EIO_vfs;
			VN_RELE(vp);
			cmn_err(CE_NOTE,
			    "%s: unexpected free inode %d, run fsck(1M)%s",
			    fs->fs_fsmnt, (int)ino,
			    (TRANS_ISTRANS(ufsvfsp) ? " -o f" : ""));
			return (EIO);
		}
	}

	/*
	 * finish initializing the vnode
	 */
	vp->v_type = IFTOVT((mode_t)ip->i_mode);

	ufs_reset_vnode(vp);

	/*
	 * read the shadow
	 */
	if (ftype != 0 && ip->i_shadow != 0) {
		if ((error = ufs_si_load(ip, cr)) != 0) {
			ip->i_flag |= ISTALE;
			ip->i_ufs_acl = NULL;
			rw_exit(&ip->i_contents);
			vp->v_vfsp = &EIO_vfs;
			VN_RELE(vp);
			return (error);
		}
	}

	/*
	 * Only attach quota information if the inode has a type and if
	 * that type is not a shadow inode.
	 */
	if (ip->i_mode && ((ip->i_mode & IFMT) != IFSHAD) &&
	    ((ip->i_mode & IFMT) != IFATTRDIR)) {
		ip->i_dquot = getinoquota(ip);
	}
	TRANS_MATA_IGET(ufsvfsp, ip);
	*ipp = ip;
	rw_exit(&ip->i_contents);

	return (0);
}
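/*
 * Illustrative caller pattern for the iget interfaces above (a sketch,
 * not code from this file): the returned inode is VN_HELD, so every
 * successful lookup must eventually be balanced by a VN_RELE().
 *
 *	struct inode *ip;
 *
 *	if (ufs_iget(vfsp, ino, &ip, CRED()) == 0) {
 *		... operate on ip / ITOV(ip) ...
 *		VN_RELE(ITOV(ip));
 *	}
 */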
/*
 * Vnode is no longer referenced, write the inode out
 * and if necessary, truncate and deallocate the file.
 */
void
ufs_iinactive(struct inode *ip)
{
	int front;
	struct inode *iq;
	struct inode *hip;
	struct ufs_q *uq;
	struct vnode *vp = ITOV(ip);

	/*
	 * Because the vnode type might have been changed,
	 * the dnlc_dir_purge must be called unconditionally.
	 */
	dnlc_dir_purge(&ip->i_danchor);

	/*
	 * Get exclusive access to inode data.
	 */
	rw_enter(&ip->i_contents, RW_WRITER);
	ASSERT(ip->i_flag & IREF);

	/*
	 * Make sure no one reclaimed the inode before we put it on
	 * the freelist or destroy it.  We keep our 'hold' on the vnode
	 * from vn_rele until we are ready to do something with the inode.
	 *
	 * Pageout may put a VN_HOLD/VN_RELE at any time during this
	 * operation via an async putpage, so we must make sure
	 * we don't free/destroy the inode more than once.  ufs_iget
	 * may also put a VN_HOLD on the inode before it grabs
	 * the i_contents lock.  This is done so we don't free
	 * an inode that a thread is waiting on.
	 */
	mutex_enter(&vp->v_lock);

	if (vp->v_count > 1) {
		vp->v_count--;	/* release our hold from vn_rele */
		mutex_exit(&vp->v_lock);
		rw_exit(&ip->i_contents);
		return;
	}
	mutex_exit(&vp->v_lock);

	/*
	 * For umount case: if ufsvfs ptr is NULL, the inode is unhashed
	 * and clean.  It can be safely destroyed (cyf).
	 */
	if (ip->i_ufsvfs == NULL) {
		rw_exit(&ip->i_contents);
		ufs_si_del(ip);
		ASSERT((vp->v_type == VCHR) || !vn_has_cached_data(vp));
		ufs_free_inode(ip);
		return;
	}

	/*
	 * queue idle inode to appropriate thread.  Will check v_count == 1
	 * prior to putting this on the appropriate queue.
	 * Stale inodes will be unhashed and freed by the ufs idle thread
	 * in ufs_idle_free()
	 */
	front = 1;
	if ((ip->i_flag & ISTALE) == 0 && ip->i_fs->fs_ronly == 0 &&
	    ip->i_mode && ip->i_nlink <= 0) {
		/*
		 * Mark the i_flag to indicate that inode is being deleted.
		 * This flag will be cleared when the deletion is complete.
		 * This prevents nfs from sneaking in via ufs_vget() while
		 * the delete is in progress (bugid 1242481).
		 */
		ip->i_flag |= IDEL;

		/*
		 * NOIDEL means that deletes are not allowed at this time;
		 * whoever resets NOIDEL will also send this inode back
		 * through ufs_iinactive.  IREF remains set.
		 */
		if (ULOCKFS_IS_NOIDEL(ITOUL(ip))) {
			mutex_enter(&vp->v_lock);
			vp->v_count--;
			mutex_exit(&vp->v_lock);
			rw_exit(&ip->i_contents);
			return;
		}
		if (!TRANS_ISTRANS(ip->i_ufsvfs)) {
			rw_exit(&ip->i_contents);
			ufs_delete(ip->i_ufsvfs, ip, 0);
			return;
		}

		/* queue to delete thread; IREF remains set */
		ins.in_qfree.value.ul++;
		uq = &ip->i_ufsvfs->vfs_delete;

		mutex_enter(&uq->uq_mutex);

		/* add to q */
		if ((iq = uq->uq_ihead) != 0) {
			ip->i_freef = iq;
			ip->i_freeb = iq->i_freeb;
			iq->i_freeb->i_freef = ip;
			iq->i_freeb = ip;
			if (front)
				uq->uq_ihead = ip;
		} else {
			uq->uq_ihead = ip;
			ip->i_freef = ip;
			ip->i_freeb = ip;
		}
	} else {
		/*
		 * queue to idle thread
		 * Check the v_count == 1 again.
		 */
		mutex_enter(&vp->v_lock);
		if (vp->v_count > 1) {
			vp->v_count--;	/* release our hold from vn_rele */
			mutex_exit(&vp->v_lock);
			rw_exit(&ip->i_contents);
			return;
		}
		mutex_exit(&vp->v_lock);
		uq = &ufs_idle_q;

		/*
		 * useful iff it has pages or is a fastsymlink; otherwise junk
		 */
		mutex_enter(&uq->uq_mutex);

		/* clear IREF means `on idle list' */
		ip->i_flag &= ~(IREF | IDIRECTIO);

		if (vn_has_cached_data(vp) || ip->i_flag & IFASTSYMLNK) {
			ins.in_frback.value.ul++;
			hip = (inode_t *)&ufs_useful_iq[IQHASH(ip)];
			ufs_nuseful_iq++;
		} else {
			ins.in_frfront.value.ul++;
			hip = (inode_t *)&ufs_junk_iq[IQHASH(ip)];
			ip->i_flag |= IJUNKIQ;
			ufs_njunk_iq++;
		}
		ip->i_freef = hip;
		ip->i_freeb = hip->i_freeb;
		hip->i_freeb->i_freef = ip;
		hip->i_freeb = ip;
	}

	/* wakeup thread(s) if q is overfull */
	if (++uq->uq_ne == uq->uq_lowat)
		cv_broadcast(&uq->uq_cv);

	/* all done, release the q and inode */
	mutex_exit(&uq->uq_mutex);
	rw_exit(&ip->i_contents);
}
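/*
 * Summary of ufs_iinactive() dispositions (descriptive only):
 *
 *  1. i_ufsvfs == NULL (forcibly unmounted): destroy the inode now.
 *  2. Unlinked (i_nlink <= 0) on a writable fs: delete inline, or queue
 *     to the per-fs delete thread if logging, with IREF kept set.
 *  3. Otherwise: clear IREF and hash it onto the idle queues -- the
 *     junk queues if it caches nothing reusable, the useful queues if
 *     it still has pages or is a fast symlink.
 */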
/*
 * Check accessed and update flags on an inode structure.
 * If any are on, update the inode with the (unique) current time.
 * If waitfor is given, ensure I/O ordering by waiting for the write
 * to complete.
 */
void
ufs_iupdat(struct inode *ip, int waitfor)
{
	struct buf *bp;
	struct fs *fp;
	struct dinode *dp;
	struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
	int i;
	int do_trans_times;
	ushort_t flag;
	o_uid_t suid;
	o_gid_t sgid;

	/*
	 * This function is now safe to be called with either the reader
	 * or writer i_contents lock.
	 */
	ASSERT(RW_LOCK_HELD(&ip->i_contents));

	/*
	 * Return if file system has been forcibly umounted.
	 */
	if (ufsvfsp == NULL)
		return;

	flag = ip->i_flag;	/* Atomic read */
	/*
	 * We better not update the disk inode from a stale inode.
	 */
	if (flag & ISTALE)
		return;

	fp = ip->i_fs;

	if ((flag & (IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG)) != 0) {
		if (fp->fs_ronly) {
			mutex_enter(&ip->i_tlock);
			ip->i_flag &= ~(IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG);
			mutex_exit(&ip->i_tlock);
			return;
		}
		/*
		 * fs is active while metadata is being written
		 */
		mutex_enter(&ufsvfsp->vfs_lock);
		ufs_notclean(ufsvfsp);
		/*
		 * get the dinode
		 */
		bp = UFS_BREAD(ufsvfsp, ip->i_dev,
		    (daddr_t)fsbtodb(fp, itod(fp, ip->i_number)),
		    (int)fp->fs_bsize);
		if (bp->b_flags & B_ERROR) {
			mutex_enter(&ip->i_tlock);
			ip->i_flag &=
			    ~(IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG);
			mutex_exit(&ip->i_tlock);
			brelse(bp);
			return;
		}
		/*
		 * munge inode fields
		 */
		mutex_enter(&ip->i_tlock);
		ITIMES_NOLOCK(ip);
		do_trans_times = ((ip->i_flag & (IMOD|IMODACC)) == IMODACC);
		ip->i_flag &= ~(IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG);
		mutex_exit(&ip->i_tlock);

		/*
		 * For reads and concurrent re-writes, no deltas were
		 * entered for the access time changes - do it now.
		 */
		if (do_trans_times) {
			TRANS_INODE_TIMES(ufsvfsp, ip);
		}

		/*
		 * For SunOS 5.0->5.4, these lines below read:
		 *
		 * suid = (ip->i_uid > MAXUID) ? UID_LONG : ip->i_uid;
		 * sgid = (ip->i_gid > MAXUID) ? GID_LONG : ip->i_gid;
		 *
		 * where MAXUID was set to 60002.  This was incorrect -
		 * the uids should have been constrained to what fitted into
		 * a 16-bit word.
		 *
		 * This means that files from 4.x filesystems that have an
		 * i_suid field larger than 60002 will have that field
		 * changed to 65535.
		 *
		 * Security note: 4.x UFS could never create an i_suid of
		 * UID_LONG since that would've corresponded to -1.
		 */
		suid = (ulong_t)ip->i_uid > (ulong_t)USHRT_MAX ?
		    UID_LONG : ip->i_uid;
		sgid = (ulong_t)ip->i_gid > (ulong_t)USHRT_MAX ?
		    GID_LONG : ip->i_gid;

		if ((ip->i_suid != suid) || (ip->i_sgid != sgid)) {
			ip->i_suid = suid;
			ip->i_sgid = sgid;
			TRANS_INODE(ufsvfsp, ip);
		}

		if ((ip->i_mode & IFMT) == IFBLK ||
		    (ip->i_mode & IFMT) == IFCHR) {
			dev_t d = ip->i_rdev;
			dev32_t dev32;

			/*
			 * load first direct block only if special device
			 */
			if (!cmpldev(&dev32, d)) {
				/*
				 * We panic here because there's "no way"
				 * we should have been able to create a large
				 * inode with a large dev_t.  Earlier layers
				 * should've caught this.
				 */
				panic("ip %p: i_rdev too big", (void *)ip);
			}

			if (dev32 & ~((O_MAXMAJ << L_BITSMINOR32) |
			    O_MAXMIN)) {
				ip->i_ordev = dev32;	/* can't use old fmt */
			} else {
				ip->i_ordev = cmpdev(d);
			}
		}
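		/*
		 * A note on the encoding above: the old on-disk format is
		 * a 16-bit SVR3-style device number, so it can only hold
		 * majors up to O_MAXMAJ and minors up to O_MAXMIN.  The
		 * mask test asks whether the 32-bit dev32 fits those
		 * limits; if not, the raw dev32 is stored in i_ordev and
		 * expldev() recovers it at iget time (the top16 check in
		 * ufs_iget_internal() distinguishes the two layouts).
		 */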
		/*
		 * copy inode to dinode (zero fastsymlnk in dinode)
		 */
		dp = (struct dinode *)bp->b_un.b_addr + itoo(fp, ip->i_number);
		dp->di_ic = ip->i_ic;	/* structure assignment */
		if (flag & IFASTSYMLNK) {
			for (i = 1; i < NDADDR; i++)
				dp->di_db[i] = 0;
			for (i = 0; i < NIADDR; i++)
				dp->di_ib[i] = 0;
		}
		if (TRANS_ISTRANS(ufsvfsp)) {
			/*
			 * Pass only a sector size buffer containing
			 * the inode, otherwise when the buffer is copied
			 * into a cached roll buffer then too much memory
			 * gets consumed if 8KB inode buffers are passed.
			 */
			TRANS_LOG(ufsvfsp, (caddr_t)dp, ip->i_doff,
			    sizeof (struct dinode),
			    (caddr_t)P2ALIGN((uintptr_t)dp, DEV_BSIZE),
			    DEV_BSIZE);

			brelse(bp);
		} else if (waitfor && (ip->i_ufsvfs->vfs_dio == 0)) {
			UFS_BRWRITE(ufsvfsp, bp);

			/*
			 * Synchronous write has guaranteed that inode
			 * has been written on disk so clear the flag
			 */
			mutex_enter(&ip->i_tlock);
			ip->i_flag &= ~IBDWRITE;
			mutex_exit(&ip->i_tlock);
		} else {
			bdrwrite(bp);

			/*
			 * This write hasn't guaranteed that inode has been
			 * written on the disk.
			 * Since all update flags on the inode are cleared,
			 * we must remember the condition in case the inode
			 * is to be updated synchronously later (e.g.
			 * fsync()/fdatasync()) and the inode has not been
			 * modified yet.
			 */
			mutex_enter(&ip->i_tlock);
			ip->i_flag |= IBDWRITE;
			mutex_exit(&ip->i_tlock);
		}
	} else {
		/*
		 * In case previous inode update was done asynchronously
		 * (IBDWRITE) and this inode update request wants guaranteed
		 * (synchronous) disk update, flush the inode.
		 */
		if (waitfor && (flag & IBDWRITE)) {
			blkflush(ip->i_dev,
			    (daddr_t)fsbtodb(fp, itod(fp, ip->i_number)));
			mutex_enter(&ip->i_tlock);
			ip->i_flag &= ~IBDWRITE;
			mutex_exit(&ip->i_tlock);
		}
	}
}

#define	SINGLE	0	/* index of single indirect block */
#define	DOUBLE	1	/* index of double indirect block */
#define	TRIPLE	2	/* index of triple indirect block */
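/*
 * Capacity sketch for the indirect levels (illustrative numbers): each
 * indirect block holds NINDIR(fs) = fs_bsize / sizeof (daddr32_t)
 * pointers.  With 8 KB blocks that is 2048, so a SINGLE indirect block
 * maps 2048 data blocks, a DOUBLE maps 2048^2, and a TRIPLE 2048^3.
 */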
/*
 * Release blocks associated with the inode ip and
 * stored in the indirect block bn.  Blocks are free'd
 * in LIFO order up to (but not including) lastbn.  If
 * level is greater than SINGLE, the block is an indirect
 * block and recursive calls to indirtrunc must be used to
 * cleanse other indirect blocks.
 *
 * N.B.: triple indirect blocks are untested.
 */
static long
indirtrunc(struct inode *ip, daddr_t bn, daddr_t lastbn, int level, int flags)
{
	int i;
	struct buf *bp, *copy;
	daddr32_t *bap;
	struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
	struct fs *fs = ufsvfsp->vfs_fs;
	daddr_t nb, last;
	long factor;
	int blocksreleased = 0, nblocks;

	ASSERT(RW_WRITE_HELD(&ip->i_contents));
	/*
	 * Calculate index in current block of last
	 * block to be kept.  -1 indicates the entire
	 * block so we need not calculate the index.
	 */
	factor = 1;
	for (i = SINGLE; i < level; i++)
		factor *= NINDIR(fs);
	last = lastbn;
	if (lastbn > 0)
		last /= factor;
	nblocks = btodb(fs->fs_bsize);
	/*
	 * Get buffer of block pointers, zero those
	 * entries corresponding to blocks to be free'd,
	 * and update on disk copy first.
	 * *Unless* the root pointer has been synchronously
	 * written to disk.  If nothing points to this
	 * indirect block then don't bother zero'ing and
	 * writing it.
	 */
	bp = UFS_BREAD(ufsvfsp,
	    ip->i_dev, (daddr_t)fsbtodb(fs, bn), (int)fs->fs_bsize);
	if (bp->b_flags & B_ERROR) {
		brelse(bp);
		return (0);
	}
	bap = bp->b_un.b_daddr;
	if ((flags & I_CHEAP) == 0) {
		uint_t zb;

		zb = (uint_t)((NINDIR(fs) - (last + 1)) * sizeof (daddr32_t));

		if (zb) {
			/*
			 * push any data into the log before we zero it
			 */
			if (bp->b_flags & B_DELWRI)
				TRANS_LOG(ufsvfsp, (caddr_t)bap,
				    ldbtob(bp->b_blkno), bp->b_bcount,
				    bp->b_un.b_addr, bp->b_bcount);
			copy = ngeteblk(fs->fs_bsize);
			bcopy((caddr_t)bap, (caddr_t)copy->b_un.b_daddr,
			    (uint_t)fs->fs_bsize);
			bzero((caddr_t)&bap[last + 1], zb);

			TRANS_BUF(ufsvfsp,
			    (caddr_t)&bap[last + 1] - (caddr_t)bap,
			    zb, bp, DT_ABZERO);

			UFS_BRWRITE(ufsvfsp, bp);
			bp = copy, bap = bp->b_un.b_daddr;
		}
	} else {
		/* make sure write retries are also cleared */
		bp->b_flags &= ~(B_DELWRI | B_RETRYWRI);
		bp->b_flags |= B_STALE | B_AGE;
	}

	/*
	 * Recursively free totally unused blocks.
	 */
	flags |= I_CHEAP;
	for (i = NINDIR(fs) - 1; i > last; i--) {
		nb = bap[i];
		if (nb == 0)
			continue;
		if (level > SINGLE) {
			blocksreleased +=
			    indirtrunc(ip, nb, (daddr_t)-1, level - 1, flags);
			free(ip, nb, (off_t)fs->fs_bsize, flags | I_IBLK);
		} else
			free(ip, nb, (off_t)fs->fs_bsize, flags);
		blocksreleased += nblocks;
	}
	flags &= ~I_CHEAP;

	/*
	 * Recursively free last partial block.
	 */
	if (level > SINGLE && lastbn >= 0) {
		last = lastbn % factor;
		nb = bap[i];
		if (nb != 0)
			blocksreleased += indirtrunc(ip, nb, last, level - 1,
			    flags);
	}
	brelse(bp);
	return (blocksreleased);
}
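/*
 * Worked example for the index math in indirtrunc() above (assuming
 * NINDIR(fs) == 2048): at level DOUBLE, factor == 2048, so keeping
 * blocks through lastbn == 5000 keeps entries 0..2 of the double
 * indirect block (last == 5000 / 2048 == 2), then recurses into entry
 * 2 with last == 5000 % 2048 == 904 to trim that final single
 * indirect block.
 */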
/*
 * Truncate the inode ip to at most length size.
 * Free affected disk blocks -- the blocks of the
 * file are removed in reverse order.
 *
 * N.B.: triple indirect blocks are untested.
 */
static int i_genrand = 1234;
int
ufs_itrunc(struct inode *oip, u_offset_t length, int flags, cred_t *cr)
{
	struct fs *fs = oip->i_fs;
	struct ufsvfs *ufsvfsp = oip->i_ufsvfs;
	struct inode *ip;
	daddr_t lastblock;
	off_t bsize;
	int boff;
	daddr_t bn, lastiblock[NIADDR];
	int level;
	long nblocks, blocksreleased = 0;
	int i;
	ushort_t mode;
	struct inode tip;
	int err;
	u_offset_t maxoffset = (ufsvfsp->vfs_lfflags & UFS_LARGEFILES) ?
	    (UFS_MAXOFFSET_T) : (MAXOFF32_T);

	/*
	 * Shadow inodes do not need to hold the vfs_dqrwlock lock.  Most
	 * other uses need the reader lock.  opendq() holds the writer lock.
	 */
	ASSERT((oip->i_mode & IFMT) == IFSHAD ||
	    RW_LOCK_HELD(&ufsvfsp->vfs_dqrwlock));
	ASSERT(RW_WRITE_HELD(&oip->i_contents));
	/*
	 * We only allow truncation of regular files and directories
	 * to arbitrary lengths here.  In addition, we allow symbolic
	 * links to be truncated only to zero length.  Other inode
	 * types cannot have their length set here.  Disk blocks are
	 * being dealt with - especially device inodes where
	 * ip->i_ordev is actually being stored in ip->i_db[0]!
	 */
	TRANS_INODE(ufsvfsp, oip);
	mode = oip->i_mode & IFMT;
	if (flags & I_FREE) {
		i_genrand *= 16843009;	/* turns into shift and adds */
		i_genrand++;
		oip->i_gen += ((i_genrand + lbolt) & 0xffff) + 1;
		oip->i_flag |= ICHG|IUPD;
		oip->i_seq++;
		if (length == oip->i_size)
			return (0);
		flags |= I_CHEAP;
	}
	if (mode == IFIFO)
		return (0);
	if (mode != IFREG && mode != IFDIR && mode != IFATTRDIR &&
	    !(mode == IFLNK && length == (offset_t)0) && mode != IFSHAD)
		return (EINVAL);
	if (length > maxoffset)
		return (EFBIG);
	if ((mode == IFDIR) || (mode == IFATTRDIR))
		flags |= I_DIR;
	if (mode == IFSHAD)
		flags |= I_SHAD;
	if (oip == ufsvfsp->vfs_qinod)
		flags |= I_QUOTA;
	if (length == oip->i_size) {
		/* update ctime and mtime to please POSIX tests */
		oip->i_flag |= ICHG|IUPD;
		oip->i_seq++;
		if (length == 0) {
			/* nothing to cache so clear the flag */
			oip->i_flag &= ~IFASTSYMLNK;
		}
		return (0);
	}
	/* wipe out fast symlink till next access */
	if (oip->i_flag & IFASTSYMLNK) {
		int j;

		ASSERT(ITOV(oip)->v_type == VLNK);

		oip->i_flag &= ~IFASTSYMLNK;

		for (j = 1; j < NDADDR; j++)
			oip->i_db[j] = 0;
		for (j = 0; j < NIADDR; j++)
			oip->i_ib[j] = 0;
	}

	boff = (int)blkoff(fs, length);

	if (length > oip->i_size) {
		/*
		 * Trunc up case.  BMAPALLOC will ensure that the right blocks
		 * are allocated.  This includes extending the old frag to a
		 * full block (if needed) in addition to doing any work
		 * needed for allocating the last block.
		 */
		if (boff == 0)
			err = BMAPALLOC(oip, length - 1, (int)fs->fs_bsize, cr);
		else
			err = BMAPALLOC(oip, length - 1, boff, cr);

		if (err == 0) {
			/*
			 * Save old size and set inode's size now
			 * so that we don't cause too much of the
			 * file to be zero'd and pushed.
			 */
			u_offset_t osize = oip->i_size;
			oip->i_size = length;
			/*
			 * Make sure we zero out the remaining bytes of
			 * the page in case a mmap scribbled on it.  We
			 * can't prevent a mmap from writing beyond EOF
			 * on the last page of a file.
			 */
			if ((boff = (int)blkoff(fs, osize)) != 0) {
				bsize = (int)lblkno(fs, osize - 1) >= NDADDR ?
				    fs->fs_bsize : fragroundup(fs, boff);
				pvn_vpzero(ITOV(oip), osize,
				    (size_t)(bsize - boff));
			}
			oip->i_flag |= ICHG|IATTCHG;
			oip->i_seq++;
			ITIMES_NOLOCK(oip);
			/*
			 * MAXOFF32_T is old 2GB size limit.  If
			 * this operation caused a large file to be
			 * created, turn on the superblock flag
			 * and update the superblock, if the flag
			 * is not already on.
			 */
			if ((length > (u_offset_t)MAXOFF32_T) &&
			    !(fs->fs_flags & FSLARGEFILES)) {
				ASSERT(ufsvfsp->vfs_lfflags & UFS_LARGEFILES);
				mutex_enter(&ufsvfsp->vfs_lock);
				fs->fs_flags |= FSLARGEFILES;
				ufs_sbwrite(ufsvfsp);
				mutex_exit(&ufsvfsp->vfs_lock);
			}
		}

		return (err);
	}
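	/*
	 * Worked example of the trunc-up zeroing above (assuming 8 KB
	 * blocks and 1 KB fragments): growing a file whose osize is
	 * 5000 gives boff == blkoff(fs, 5000) == 5000, so bsize ==
	 * fragroundup(fs, 5000) == 5120 and pvn_vpzero() clears bytes
	 * 5000..5119, the mmap-visible tail of the old last fragment.
	 */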
	/*
	 * Update the pages of the file.  If the file is not being
	 * truncated to a block boundary, the contents of the
	 * pages following the end of the file must be zero'ed
	 * in case it ever becomes accessible again because
	 * of subsequent file growth.
	 */
	if (boff == 0) {
		(void) pvn_vplist_dirty(ITOV(oip), length, ufs_putapage,
		    B_INVAL | B_TRUNC, CRED());
	} else {
		/*
		 * Make sure that the last block is properly allocated.
		 * We only really have to do this if the last block is
		 * actually allocated since ufs_bmap will now handle the case
		 * of a fragment which has no block allocated.  Just to
		 * be sure, we do it now independently of current allocation.
		 */
		err = BMAPALLOC(oip, length - 1, boff, cr);
		if (err)
			return (err);

		/*
		 * BMAPALLOC will call bmap_write which defers i_seq
		 * processing.  If the timestamps were changed, update
		 * i_seq before rdip drops i_contents or syncs the inode.
		 */
		if (oip->i_flag & (ICHG|IUPD))
			oip->i_seq++;

		/*
		 * BugId 4069932
		 * Make sure that the relevant partial page appears in
		 * the v_pages list, so that pvn_vpzero() will do its
		 * job.  Since doing this correctly requires everything
		 * in rdip() except for the uiomove(), it's easier and
		 * safer to do the uiomove() rather than duplicate the
		 * rest of rdip() here.
		 *
		 * To get here, we know that length indicates a byte
		 * that is not the first byte of a block.  (length - 1)
		 * is the last actual byte known to exist.  Deduction
		 * shows it is in the same block as byte (length).
		 * Thus, this rdip() invocation should always succeed
		 * except in the face of i/o errors, and give us the
		 * block we care about.
		 *
		 * rdip() makes the same locking assertions and
		 * assumptions as we do.  We do not acquire any locks
		 * before calling it, so we have not changed the locking
		 * situation.  Finally, there do not appear to be any
		 * paths whereby rdip() ends up invoking us again.
		 * Thus, infinite recursion is avoided.
		 */
		{
			uio_t uio;
			iovec_t iov[1];
			char buffer;

			uio.uio_iov = iov;
			uio.uio_iovcnt = 1;
			uio.uio_loffset = length - 1;
			uio.uio_resid = 1;
			uio.uio_segflg = UIO_SYSSPACE;
			uio.uio_extflg = UIO_COPY_CACHED;

			iov[0].iov_base = &buffer;
			iov[0].iov_len = 1;

			err = rdip(oip, &uio, UIO_READ, NULL);
			if (err)
				return (err);
		}

		bsize = (int)lblkno(fs, length - 1) >= NDADDR ?
		    fs->fs_bsize : fragroundup(fs, boff);
		pvn_vpzero(ITOV(oip), length, (size_t)(bsize - boff));
		/*
		 * Ensure full fs block is marked as dirty.
		 */
		(void) pvn_vplist_dirty(ITOV(oip), length + (bsize - boff),
		    ufs_putapage, B_INVAL | B_TRUNC, CRED());
	}

	/*
	 * Calculate index into inode's block list of
	 * last direct and indirect blocks (if any)
	 * which we want to keep.  Lastblock is -1 when
	 * the file is truncated to 0.
	 */
	lastblock = lblkno(fs, length + fs->fs_bsize - 1) - 1;
	lastiblock[SINGLE] = lastblock - NDADDR;
	lastiblock[DOUBLE] = lastiblock[SINGLE] - NINDIR(fs);
	lastiblock[TRIPLE] = lastiblock[DOUBLE] - NINDIR(fs) * NINDIR(fs);
	nblocks = btodb(fs->fs_bsize);
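	/*
	 * Worked example (assuming NDADDR == 12 and NINDIR == 2048):
	 * truncating to length 0 gives lastblock == -1 and all
	 * lastiblock[] values negative, so every direct and indirect
	 * block is released; truncating to one block (length <= bsize)
	 * gives lastblock == 0, keeping only i_db[0].
	 */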
	/*
	 * Update file and block pointers
	 * on disk before we start freeing blocks.
	 * If we crash before free'ing blocks below,
	 * the blocks will be returned to the free list.
	 * lastiblock values are also normalized to -1
	 * for calls to indirtrunc below.
	 */
	tip = *oip;			/* structure copy */
	ip = &tip;

	for (level = TRIPLE; level >= SINGLE; level--)
		if (lastiblock[level] < 0) {
			oip->i_ib[level] = 0;
			lastiblock[level] = -1;
		}
	for (i = NDADDR - 1; i > lastblock; i--) {
		oip->i_db[i] = 0;
		flags |= I_CHEAP;
	}
	oip->i_size = length;
	oip->i_flag |= ICHG|IUPD|IATTCHG;
	oip->i_seq++;
	if (!TRANS_ISTRANS(ufsvfsp))
		ufs_iupdat(oip, I_SYNC);	/* do sync inode update */

	/*
	 * Indirect blocks first.
	 */
	for (level = TRIPLE; level >= SINGLE; level--) {
		bn = ip->i_ib[level];
		if (bn != 0) {
			blocksreleased +=
			    indirtrunc(ip, bn, lastiblock[level], level, flags);
			if (lastiblock[level] < 0) {
				ip->i_ib[level] = 0;
				free(ip, bn, (off_t)fs->fs_bsize,
				    flags | I_IBLK);
				blocksreleased += nblocks;
			}
		}
		if (lastiblock[level] >= 0)
			goto done;
	}

	/*
	 * All whole direct blocks or frags.
	 */
	for (i = NDADDR - 1; i > lastblock; i--) {
		bn = ip->i_db[i];
		if (bn == 0)
			continue;
		ip->i_db[i] = 0;
		bsize = (off_t)blksize(fs, ip, i);
		free(ip, bn, bsize, flags);
		blocksreleased += btodb(bsize);
	}
	if (lastblock < 0)
		goto done;

	/*
	 * Finally, look for a change in size of the
	 * last direct block; release any frags.
	 */
	bn = ip->i_db[lastblock];
	if (bn != 0) {
		off_t oldspace, newspace;

		/*
		 * Calculate amount of space we're giving
		 * back as old block size minus new block size.
		 */
		oldspace = blksize(fs, ip, lastblock);
		UFS_SET_ISIZE(length, ip);
		newspace = blksize(fs, ip, lastblock);
		if (newspace == 0) {
			err = ufs_fault(ITOV(ip), "ufs_itrunc: newspace == 0");
			return (err);
		}
		if (oldspace - newspace > 0) {
			/*
			 * Block number of space to be free'd is
			 * the old block # plus the number of frags
			 * required for the storage we're keeping.
			 */
			bn += numfrags(fs, newspace);
			free(ip, bn, oldspace - newspace, flags);
			blocksreleased += btodb(oldspace - newspace);
		}
	}
done:
	/* BEGIN PARANOIA */
	for (level = SINGLE; level <= TRIPLE; level++)
		if (ip->i_ib[level] != oip->i_ib[level]) {
			err = ufs_fault(ITOV(ip), "ufs_itrunc: indirect block");
			return (err);
		}

	for (i = 0; i < NDADDR; i++)
		if (ip->i_db[i] != oip->i_db[i]) {
			err = ufs_fault(ITOV(ip), "ufs_itrunc: direct block");
			return (err);
		}
	/* END PARANOIA */
	oip->i_blocks -= blocksreleased;

	if (oip->i_blocks < 0) {		/* sanity */
		cmn_err(CE_NOTE,
		    "ufs_itrunc: %s/%d new size = %lld, blocks = %d\n",
		    fs->fs_fsmnt, (int)oip->i_number, oip->i_size,
		    (int)oip->i_blocks);
		oip->i_blocks = 0;
	}
	oip->i_flag |= ICHG|IATTCHG;
	oip->i_seq++;
	/* blocksreleased is >= zero, so this cannot fail */
	(void) chkdq(oip, -blocksreleased, 0, cr, (char **)NULL,
	    (size_t *)NULL);
	return (0);
}
/*
 * Check mode permission on inode.  Mode is READ, WRITE or EXEC.
 * In the case of WRITE, the read-only status of the file system
 * is checked.  Depending on the calling user, the appropriate
 * mode bits are selected; privileges to override missing permission
 * bits are checked through secpolicy_vnode_access().
 */
int
ufs_iaccess(void *vip, int mode, struct cred *cr)
{
	struct inode *ip = vip;
	int shift = 0;

	if (mode & IWRITE) {
		/*
		 * Disallow write attempts on read-only
		 * file systems, unless the file is a block
		 * or character device or a FIFO.
		 */
		if (ip->i_fs->fs_ronly != 0) {
			if ((ip->i_mode & IFMT) != IFCHR &&
			    (ip->i_mode & IFMT) != IFBLK &&
			    (ip->i_mode & IFMT) != IFIFO) {
				return (EROFS);
			}
		}
	}
	/*
	 * If there is a shadow inode, check for the presence of an acl;
	 * if the acl is there, use the ufs_acl_access routine to check
	 * the acl.
	 */
	if (ip->i_ufs_acl && ip->i_ufs_acl->aowner)
		return (ufs_acl_access(ip, mode, cr));

	/*
	 * Access check is based on only
	 * one of owner, group, public.
	 * If not owner, then check group.
	 * If not a member of the group, then
	 * check public access.
	 */
	if (crgetuid(cr) != ip->i_uid) {
		shift += 3;
		if (!groupmember((uid_t)ip->i_gid, cr))
			shift += 3;
	}

	mode &= ~(ip->i_mode << shift);

	if (mode == 0)
		return (0);

	/* test missing privilege bits */
	return (secpolicy_vnode_access(cr, ITOV(ip), ip->i_uid, mode));
}
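/*
 * Worked example for the shift logic above: take i_mode 0640 (owner
 * rw-, group r--) and a group-member caller requesting IWRITE (0200).
 * shift == 3, so (i_mode << 3) places the group bits in the owner
 * position: (0640 << 3) & 0200 == 0.  mode &= ~(i_mode << shift)
 * therefore leaves 0200 set, and the request falls through to
 * secpolicy_vnode_access() for a privilege check.
 */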
/*
 * if necessary, remove an inode from the free list
 * i_contents is held except at unmount
 *
 * Return 1 if the inode is taken off of the ufs_idle_q,
 * and the caller is expected to call VN_RELE.
 *
 * Return 0 otherwise.
 */
int
ufs_rmidle(struct inode *ip)
{
	int rval = 0;

	mutex_enter(&ip->i_tlock);
	if ((ip->i_flag & IREF) == 0) {
		mutex_enter(&ufs_idle_q.uq_mutex);
		ip->i_freef->i_freeb = ip->i_freeb;
		ip->i_freeb->i_freef = ip->i_freef;
		ip->i_freef = ip;
		ip->i_freeb = ip;
		ip->i_flag |= IREF;
		ufs_idle_q.uq_ne--;
		if (ip->i_flag & IJUNKIQ) {
			ufs_njunk_iq--;
			ip->i_flag &= ~IJUNKIQ;
		} else {
			ufs_nuseful_iq--;
		}
		mutex_exit(&ufs_idle_q.uq_mutex);
		rval = 1;
	}
	mutex_exit(&ip->i_tlock);
	return (rval);
}

/*
 * scan the hash of inodes and call func with the inode locked
 */
int
ufs_scan_inodes(int rwtry, int (*func)(struct inode *, void *), void *arg,
    struct ufsvfs *ufsvfsp)
{
	struct inode *ip;		/* current inode */
	struct inode *lip = NULL;	/* last/previous inode */
	union ihead *ih;		/* current hash chain */
	int error, i;
	int saverror = 0;
	int lip_held;			/* lip needs a VN_RELE() */

	/*
	 * If ufsvfsp is NULL, then our caller should be holding
	 * ufs_scan_lock to avoid conflicts between ufs_unmount() and
	 * ufs_update().  Otherwise, to avoid false-positives in
	 * ufs_unmount()'s v_count-based EBUSY check, we only hold
	 * those inodes that are in the file system our caller cares
	 * about.
	 *
	 * We know that ip is a valid inode in the hash chain (and thus
	 * we can trust i_ufsvfs) because the inode we chained from
	 * (lip) is still in the hash chain.  This is true because either:
	 *
	 * 1. We did not drop the hash chain lock since the last
	 * iteration (because we were not interested in the last inode),
	 * or
	 * 2. We maintained a hold on the last inode while we
	 * were processing it, so it could not be removed
	 * from the hash chain.
	 *
	 * The whole reason we're dropping and re-grabbing the chain
	 * lock on every inode is so that we don't present a major
	 * choke point on throughput, particularly when we've been
	 * called on behalf of fsflush.
	 */

	for (i = 0, ih = ihead; i < inohsz; i++, ih++) {
		mutex_enter(&ih_lock[i]);
		for (ip = ih->ih_chain[0], lip_held = 0;
		    ip != (struct inode *)ih;
		    ip = lip->i_forw) {

			ins.in_scan.value.ul++;

			/*
			 * Undo the previous iteration's VN_HOLD(), but
			 * only if one was done.
			 */
			if (lip_held)
				VN_RELE(ITOV(lip));

			lip = ip;
			if (ufsvfsp != NULL && ip->i_ufsvfs != ufsvfsp) {
				/*
				 * We're not processing all inodes, and
				 * this inode is not in the filesystem of
				 * interest, so skip it.  No need to do a
				 * VN_HOLD() since we're not dropping the
				 * hash chain lock until after we've
				 * done the i_forw traversal above.
				 */
				lip_held = 0;
				continue;
			}
			VN_HOLD(ITOV(ip));
			lip_held = 1;
			mutex_exit(&ih_lock[i]);

			/*
			 * Acquire the contents lock as writer to make
			 * sure that the inode has been initialized in
			 * the cache or removed from the idle list by
			 * ufs_iget().  This works because ufs_iget()
			 * acquires the contents lock before putting
			 * the inode into the cache.  If we can lock
			 * it, then ufs_iget() is done with it.
			 */

			if (rwtry) {
				if (!rw_tryenter(&ip->i_contents, RW_WRITER)) {
					mutex_enter(&ih_lock[i]);
					continue;
				}
			} else {
				rw_enter(&ip->i_contents, RW_WRITER);
			}

			rw_exit(&ip->i_contents);

			/*
			 * ISTALE means the inode couldn't be read
			 *
			 * We don't have to hold the i_contents lock
			 * for this check for a couple of
			 * reasons.  First, if ISTALE is set then the
			 * flag cannot be cleared until the inode is
			 * removed from the cache and that cannot
			 * happen until after we VN_RELE() it.
			 * Second, if ISTALE is not set, then the
			 * inode is in the cache and does not need to
			 * be read from disk so ISTALE cannot be set
			 * while we are not looking.
			 */
			if ((ip->i_flag & ISTALE) == 0) {
				if ((error = (*func)(ip, arg)) != 0)
					saverror = error;
			}

			mutex_enter(&ih_lock[i]);
		}
		if (lip_held)
			VN_RELE(ITOV(lip));
		mutex_exit(&ih_lock[i]);
	}
	return (saverror);
}

/*
 * Mark inode with the current time, plus a unique increment.
 *
 * Since we only keep 32-bit time on disk, if UFS is still alive
 * beyond 2038, filesystem times will simply stick at the last
 * possible second of 32-bit time.  Not ideal, but probably better
 * than going into the remote past, or confusing applications with
 * negative time.
 */
void
ufs_imark(struct inode *ip)
{
	timestruc_t now;
	int32_t	usec, nsec;

	/*
	 * The update of i_seq may have been deferred, increase i_seq here
	 * to make sure it is in sync with the timestamps.
	 */
	if (ip->i_flag & ISEQ) {
		ASSERT(ip->i_flag & (IUPD|ICHG));
		ip->i_seq++;
		ip->i_flag &= ~ISEQ;
	}

	gethrestime(&now);

	/*
	 * Fast algorithm to convert nsec to usec -- see hrt2ts()
	 * in common/os/timers.c for a full description.
	 */
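	/*
	 * In outline (descriptive; see hrt2ts() for the derivation): the
	 * shift-and-add sequence below approximates usec = nsec / 1000
	 * by multiplying by roughly 2^10 / 1000 and then shifting right
	 * by 10, avoiding a hardware divide on this hot path.
	 */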
	nsec = now.tv_nsec;
	usec = nsec + (nsec >> 2);
	usec = nsec + (usec >> 1);
	usec = nsec + (usec >> 2);
	usec = nsec + (usec >> 4);
	usec = nsec - (usec >> 3);
	usec = nsec + (usec >> 2);
	usec = nsec + (usec >> 3);
	usec = nsec + (usec >> 4);
	usec = nsec + (usec >> 1);
	usec = nsec + (usec >> 6);
	usec = usec >> 10;

	mutex_enter(&ufs_iuniqtime_lock);
	if (now.tv_sec > (time_t)iuniqtime.tv_sec ||
	    usec > iuniqtime.tv_usec) {
		if (now.tv_sec < TIME32_MAX) {
			iuniqtime.tv_sec = (time32_t)now.tv_sec;
			iuniqtime.tv_usec = usec;
		}
	} else {
		if (iuniqtime.tv_sec < TIME32_MAX) {
			iuniqtime.tv_usec++;
			/* Check for usec overflow */
			if (iuniqtime.tv_usec >= MICROSEC) {
				iuniqtime.tv_sec++;
				iuniqtime.tv_usec = 0;
			}
		}
	}

	if ((ip->i_flag & IACC) && !(ip->i_ufsvfs->vfs_noatime)) {
		ip->i_atime = iuniqtime;
	}
	if (ip->i_flag & IUPD) {
		ip->i_mtime = iuniqtime;
		ip->i_flag |= IMODTIME;
	}
	if (ip->i_flag & ICHG) {
		ip->i_diroff = 0;
		ip->i_ctime = iuniqtime;
	}
	mutex_exit(&ufs_iuniqtime_lock);
}

/*
 * Update timestamps in inode.
 */
void
ufs_itimes_nolock(struct inode *ip)
{
	/*
	 * if noatime is set and the inode access time is the only field that
	 * must be changed, exit immediately.
	 */
	if (((ip->i_flag & (IUPD|IACC|ICHG)) == IACC) &&
	    (ip->i_ufsvfs->vfs_noatime)) {
		return;
	}

	if (ip->i_flag & (IUPD|IACC|ICHG)) {
		if (ip->i_flag & ICHG)
			ip->i_flag |= IMOD;
		else
			ip->i_flag |= IMODACC;
		ufs_imark(ip);
		ip->i_flag &= ~(IACC|IUPD|ICHG);
	}
}