/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */


#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/uio.h>
#include <sys/bitmap.h>
#include <sys/signal.h>
#include <sys/cred.h>
#include <sys/user.h>
#include <sys/vfs.h>
#include <sys/stat.h>
#include <sys/vnode.h>
#include <sys/buf.h>
#include <sys/proc.h>
#include <sys/disp.h>
#include <sys/dnlc.h>
#include <sys/mode.h>
#include <sys/cmn_err.h>
#include <sys/kstat.h>
#include <sys/acl.h>
#include <sys/var.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_fs.h>
#include <sys/fs/ufs_trans.h>
#include <sys/fs/ufs_acl.h>
#include <sys/fs/ufs_bio.h>
#include <sys/fs/ufs_quota.h>
#include <sys/fs/ufs_log.h>
#include <vm/hat.h>
#include <vm/as.h>
#include <vm/pvn.h>
#include <vm/seg.h>
#include <sys/swap.h>
#include <sys/cpuvar.h>
#include <sys/sysmacros.h>
#include <sys/errno.h>
#include <sys/kmem.h>
#include <sys/debug.h>
#include <fs/fs_subr.h>
#include <sys/policy.h>

struct kmem_cache *inode_cache;		/* cache of free inodes */

/* UFS Inode Cache Stats -- Not protected */
struct instats ins = {
	{ "size",		KSTAT_DATA_ULONG },
	{ "maxsize",		KSTAT_DATA_ULONG },
	{ "hits",		KSTAT_DATA_ULONG },
	{ "misses",		KSTAT_DATA_ULONG },
	{ "kmem allocs",	KSTAT_DATA_ULONG },
	{ "kmem frees",		KSTAT_DATA_ULONG },
	{ "maxsize reached",	KSTAT_DATA_ULONG },
	{ "puts at frontlist",	KSTAT_DATA_ULONG },
	{ "puts at backlist",	KSTAT_DATA_ULONG },
	{ "queues to free",	KSTAT_DATA_ULONG },
	{ "scans",		KSTAT_DATA_ULONG },
	{ "thread idles",	KSTAT_DATA_ULONG },
	{ "lookup idles",	KSTAT_DATA_ULONG },
	{ "vget idles",		KSTAT_DATA_ULONG },
	{ "cache allocs",	KSTAT_DATA_ULONG },
	{ "cache frees",	KSTAT_DATA_ULONG },
	{ "pushes at close",	KSTAT_DATA_ULONG }
};

/* kstat data */
static kstat_t *ufs_inode_kstat = NULL;

union ihead *ihead;	/* inode LRU cache, Chris Maltby */
kmutex_t *ih_lock;	/* protect inode cache hash table */
static int ino_hashlen = 4;	/* desired average hash chain length */
int inohsz;		/* number of buckets in the hash table */

kmutex_t ufs_scan_lock;		/* stop racing multiple ufs_scan_inodes() */
kmutex_t ufs_iuniqtime_lock;	/* protect iuniqtime */
kmutex_t ufsvfs_mutex;
struct ufsvfs *oldufsvfslist, *ufsvfslist;

/*
 * time to wait after ufsvfsp->vfs_iotstamp before declaring that no
 * I/Os are going on.
 */
clock_t ufs_iowait;

/*
 * the threads that process idle inodes and free (deleted) inodes
 * have high water marks that are set in ufs_iinit().
 * These values can be no less than the minimum shown below.
 */
int ufs_idle_max;	/* # of allowable idle inodes */
ulong_t ufs_inode_max;	/* hard limit of allowable idle inodes */
#define	UFS_IDLE_MAX	(16)	/* min # of allowable idle inodes */

/*
 * Tunables for ufs write throttling.
 * These are validated in ufs_iinit() since improper settings
 * can lead to filesystem hangs.
 */
#define	UFS_HW_DEFAULT	(16 * 1024 * 1024)
#define	UFS_LW_DEFAULT	(8 * 1024 * 1024)
int	ufs_HW = UFS_HW_DEFAULT;
int	ufs_LW = UFS_LW_DEFAULT;
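
/*
 * Both tunables can be overridden from /etc/system; the values shown
 * here are illustrative only:
 *
 *	set ufs:ufs_HW = 33554432
 *	set ufs:ufs_LW = 16777216
 */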

static void ihinit(void);
extern int hash2ints(int, int);

static int ufs_iget_internal(struct vfs *, ino_t, struct inode **,
	struct cred *, int);

/* ARGSUSED */
static int
ufs_inode_kstat_update(kstat_t *ksp, int rw)
{
	if (rw == KSTAT_WRITE)
		return (EACCES);

	ins.in_malloc.value.ul	= (ulong_t)kmem_cache_stat(inode_cache,
	    "slab_alloc");
	ins.in_mfree.value.ul	= (ulong_t)kmem_cache_stat(inode_cache,
	    "slab_free");
	ins.in_kcalloc.value.ul	= (ulong_t)kmem_cache_stat(inode_cache,
	    "alloc");
	ins.in_kcfree.value.ul	= (ulong_t)kmem_cache_stat(inode_cache,
	    "free");
	ins.in_size.value.ul	= (ulong_t)kmem_cache_stat(inode_cache,
	    "buf_inuse");
	ins.in_maxreached.value.ul = (ulong_t)kmem_cache_stat(inode_cache,
	    "buf_max");
	ins.in_misses.value.ul = ins.in_kcalloc.value.ul;

	return (0);
}

void
ufs_iinit(void)
{
	/*
	 * Validate that ufs_HW > ufs_LW.
	 * The default values for these two tunables have been increased.
	 * There is now a range of values for ufs_HW that used to be
	 * legal on previous Solaris versions but no longer is now.
	 * Upgrading a machine which has an /etc/system setting for ufs_HW
	 * from that range can lead to filesystem hangs unless the values
	 * are checked here.
	 */
	if (ufs_HW <= ufs_LW) {
		cmn_err(CE_WARN,
		    "ufs_HW (%d) <= ufs_LW (%d). Check /etc/system.",
		    ufs_HW, ufs_LW);
		ufs_LW = UFS_LW_DEFAULT;
		ufs_HW = UFS_HW_DEFAULT;
		cmn_err(CE_CONT, "using defaults, ufs_HW = %d, ufs_LW = %d\n",
		    ufs_HW, ufs_LW);
	}

	/*
	 * Adjust the tunable `ufs_ninode' to a reasonable value
	 */
	if (ufs_ninode <= 0)
		ufs_ninode = ncsize;
	if (ufs_inode_max == 0)
		ufs_inode_max = (ulong_t)((kmem_maxavail() >> 2) /
		    sizeof (struct inode));
	if (ufs_ninode > ufs_inode_max || (ufs_ninode == 0 && ncsize == 0)) {
		cmn_err(CE_NOTE, "setting ufs_ninode to max value of %ld",
		    ufs_inode_max);
		ufs_ninode = ufs_inode_max;
	}
	/*
	 * Wait till third call of ufs_update to declare that no I/Os are
	 * going on. This allows deferred access times to be flushed to disk.
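	 * With the default autoup of 30 seconds, this works out to roughly
	 * one minute's worth of clock ticks.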
	 */
	ufs_iowait = v.v_autoup * hz * 2;

	/*
	 * idle thread runs when 25% of ufs_ninode entries are on the queue
	 */
	if (ufs_idle_max == 0)
		ufs_idle_max = ufs_ninode >> 2;
	if (ufs_idle_max < UFS_IDLE_MAX)
		ufs_idle_max = UFS_IDLE_MAX;
	if (ufs_idle_max > ufs_ninode)
		ufs_idle_max = ufs_ninode;
	/*
	 * This is really a misnomer, it is ufs_queue_init
	 */
	ufs_thread_init(&ufs_idle_q, ufs_idle_max);
	ufs_thread_start(&ufs_idle_q, ufs_thread_idle, NULL);

	/*
	 * global hlock thread
	 */
	ufs_thread_init(&ufs_hlock, 1);
	ufs_thread_start(&ufs_hlock, ufs_thread_hlock, NULL);

	ihinit();
	qtinit();
	ins.in_maxsize.value.ul = ufs_ninode;
	if ((ufs_inode_kstat = kstat_create("ufs", 0, "inode_cache", "ufs",
	    KSTAT_TYPE_NAMED, sizeof (ins) / sizeof (kstat_named_t),
	    KSTAT_FLAG_VIRTUAL)) != NULL) {
		ufs_inode_kstat->ks_data = (void *)&ins;
		ufs_inode_kstat->ks_update = ufs_inode_kstat_update;
		kstat_install(ufs_inode_kstat);
	}
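	/*
	 * The statistics registered above can be examined from userland
	 * with kstat(1M), e.g. "kstat -m ufs -n inode_cache".
	 */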
	ufsfx_init();		/* fix-on-panic initialization */
	si_cache_init();
	ufs_directio_init();
	lufs_init();
	mutex_init(&ufs_iuniqtime_lock, NULL, MUTEX_DEFAULT, NULL);
}

/* ARGSUSED */
static int
ufs_inode_cache_constructor(void *buf, void *cdrarg, int kmflags)
{
	struct inode *ip = buf;
	struct vnode *vp;

	rw_init(&ip->i_rwlock, NULL, RW_DEFAULT, NULL);
	rw_init(&ip->i_contents, NULL, RW_DEFAULT, NULL);
	mutex_init(&ip->i_tlock, NULL, MUTEX_DEFAULT, NULL);
	dnlc_dir_init(&ip->i_danchor);

	cv_init(&ip->i_wrcv, NULL, CV_DRIVER, NULL);

	vp = vn_alloc(KM_SLEEP);
	ip->i_vnode = vp;

	vn_setops(vp, ufs_vnodeops);
	vp->v_data = (caddr_t)ip;

	return (0);
}

/* ARGSUSED */
static void
ufs_inode_cache_destructor(void *buf, void *cdrarg)
{
	struct inode *ip = buf;
	struct vnode *vp;

	vp = ITOV(ip);

	rw_destroy(&ip->i_rwlock);
	rw_destroy(&ip->i_contents);

	mutex_destroy(&ip->i_tlock);
	if (vp->v_type == VDIR) {
		dnlc_dir_fini(&ip->i_danchor);
	}

	cv_destroy(&ip->i_wrcv);

	vn_free(vp);
}

/*
 * Initialize hash links for inodes
 * and build inode free list.
 */
void
ihinit(void)
{
	int i;
	union ihead *ih = ihead;

	mutex_init(&ufs_scan_lock, NULL, MUTEX_DEFAULT, NULL);

	inohsz = 1 << highbit(ufs_ninode / ino_hashlen);
	ihead = kmem_zalloc(inohsz * sizeof (union ihead), KM_SLEEP);
	ih_lock = kmem_zalloc(inohsz * sizeof (kmutex_t), KM_SLEEP);

	for (i = 0, ih = ihead; i < inohsz; i++, ih++) {
		ih->ih_head[0] = ih;
		ih->ih_head[1] = ih;
		mutex_init(&ih_lock[i], NULL, MUTEX_DEFAULT, NULL);
	}
	inode_cache = kmem_cache_create("ufs_inode_cache",
	    sizeof (struct inode), 0, ufs_inode_cache_constructor,
	    ufs_inode_cache_destructor, ufs_inode_cache_reclaim,
	    NULL, NULL, 0);
}

/*
 * Free an inode structure
 */
void
ufs_free_inode(struct inode *ip)
{
	vn_invalid(ITOV(ip));
	kmem_cache_free(inode_cache, ip);
}

/*
 * Allocate an inode structure
 */
struct inode *
ufs_alloc_inode(ufsvfs_t *ufsvfsp, ino_t ino)
{
	struct inode *ip;
	vnode_t *vp;

	ip = kmem_cache_alloc(inode_cache, KM_SLEEP);
	/*
	 * at this point we have a newly allocated inode
	 */
	ip->i_freef = ip;
	ip->i_freeb = ip;
	ip->i_flag = IREF;
	ip->i_seq = 0xFF;	/* Unique initial value */
	ip->i_dev = ufsvfsp->vfs_dev;
	ip->i_ufsvfs = ufsvfsp;
	ip->i_devvp = ufsvfsp->vfs_devvp;
	ip->i_number = ino;
	ip->i_diroff = 0;
	ip->i_nextr = 0;
	ip->i_map = NULL;
	ip->i_rdev = 0;
	ip->i_writes = 0;
	ip->i_mode = 0;
	ip->i_delaylen = 0;
	ip->i_delayoff = 0;
	ip->i_nextrio = 0;
	ip->i_ufs_acl = NULL;
	ip->i_cflags = 0;
	ip->i_mapcnt = 0;
	ip->i_dquot = NULL;
	ip->i_cachedir = CD_ENABLED;
	ip->i_writer = NULL;

	/*
	 * the vnode for this inode was allocated by the constructor
	 */
	vp = ITOV(ip);
	vn_reinit(vp);
	if (ino == (ino_t)UFSROOTINO)
		vp->v_flag = VROOT;
	vp->v_vfsp = ufsvfsp->vfs_vfs;
	vn_exists(vp);
	return (ip);
}

/*
 * Look up an inode by device, inumber.  If it is in core (in the
 * inode structure), honor the locking protocol.  If it is not in
 * core, read it in from the specified device after freeing any pages.
 * In all cases, a pointer to a VN_HELD inode structure is returned.
 */
int
ufs_iget(struct vfs *vfsp, ino_t ino, struct inode **ipp, struct cred *cr)
{
	return (ufs_iget_internal(vfsp, ino, ipp, cr, 0));
}

/*
 * A version of ufs_iget which returns only allocated, linked inodes.
 * This is appropriate for any callers who do not expect a free inode.
 */
int
ufs_iget_alloced(struct vfs *vfsp, ino_t ino, struct inode **ipp,
    struct cred *cr)
{
	return (ufs_iget_internal(vfsp, ino, ipp, cr, 1));
}

/*
 * Set vnode attributes based on v_type, this should be called whenever
 * an inode's i_mode is changed.
 */
void
ufs_reset_vnode(vnode_t *vp)
{
	/*
	 * an old DBE hack
	 */
	if ((VTOI(vp)->i_mode & (ISVTX | IEXEC | IFDIR)) == ISVTX)
		vp->v_flag |= VSWAPLIKE;
	else
		vp->v_flag &= ~VSWAPLIKE;

	/*
	 * if not swap like and it's just a regular file, we want
	 * to maintain the vnode's pages sorted by clean/modified
	 * for faster sync'ing to disk
	 */
	if (vp->v_type == VREG)
		vp->v_flag |= VMODSORT;
	else
		vp->v_flag &= ~VMODSORT;

	/*
	 * Is this an attribute hidden dir?
	 */
	if ((VTOI(vp)->i_mode & IFMT) == IFATTRDIR)
		vp->v_flag |= V_XATTRDIR;
	else
		vp->v_flag &= ~V_XATTRDIR;
}

/*
 * Shared implementation of ufs_iget and ufs_iget_alloced.  The 'validate'
 * flag is used to distinguish the two; when true, we validate that the inode
 * being retrieved looks like a linked and allocated inode.
 */
/* ARGSUSED */
static int
ufs_iget_internal(struct vfs *vfsp, ino_t ino, struct inode **ipp,
    struct cred *cr, int validate)
{
	struct inode *ip, *sp;
	union ihead *ih;
	kmutex_t *ihm;
	struct buf *bp;
	struct dinode *dp;
	struct vnode *vp;
	extern vfs_t EIO_vfs;
	int error;
	int ftype;	/* XXX - Remove later on */
	dev_t vfs_dev;
	struct ufsvfs *ufsvfsp;
	struct fs *fs;
	int hno;
	daddr_t bno;
	ulong_t ioff;

	CPU_STATS_ADD_K(sys, ufsiget, 1);

	/*
	 * Lookup inode in cache.
	 */
	vfs_dev = vfsp->vfs_dev;
	hno = INOHASH(ino);
	ih = &ihead[hno];
	ihm = &ih_lock[hno];

again:
	mutex_enter(ihm);
	for (ip = ih->ih_chain[0]; ip != (struct inode *)ih; ip = ip->i_forw) {
		if (ino != ip->i_number || vfs_dev != ip->i_dev ||
		    (ip->i_flag & ISTALE))
			continue;

		/*
		 * Found the interesting inode; hold it and drop the cache lock
		 */
		vp = ITOV(ip);	/* for locknest */
		VN_HOLD(vp);
		mutex_exit(ihm);
		rw_enter(&ip->i_contents, RW_READER);

		/*
		 * if necessary, remove from idle list
		 */
		if ((ip->i_flag & IREF) == 0) {
			if (ufs_rmidle(ip))
				VN_RELE(vp);
		}

		/*
		 * Could the inode be read from disk?
		 */
		if (ip->i_flag & ISTALE) {
			rw_exit(&ip->i_contents);
			VN_RELE(vp);
			goto again;
		}

		ins.in_hits.value.ul++;
		*ipp = ip;

		/*
		 * Reset the vnode's attribute flags
		 */
		mutex_enter(&vp->v_lock);
		ufs_reset_vnode(vp);
		mutex_exit(&vp->v_lock);

		rw_exit(&ip->i_contents);

		return (0);
	}
	mutex_exit(ihm);

	/*
	 * Inode was not in cache.
	 *
	 * Allocate a new entry
	 */
	ufsvfsp = (struct ufsvfs *)vfsp->vfs_data;
	fs = ufsvfsp->vfs_fs;

	ip = ufs_alloc_inode(ufsvfsp, ino);
	vp = ITOV(ip);

	bno = fsbtodb(fs, itod(fs, ino));
	ioff = (sizeof (struct dinode)) * (itoo(fs, ino));
	ip->i_doff = (offset_t)ioff + ldbtob(bno);

	/*
	 * put a place holder in the cache (if not already there)
	 */
	mutex_enter(ihm);
	for (sp = ih->ih_chain[0]; sp != (struct inode *)ih; sp = sp->i_forw)
		if (ino == sp->i_number && vfs_dev == sp->i_dev &&
		    ((sp->i_flag & ISTALE) == 0)) {
			mutex_exit(ihm);
			ufs_free_inode(ip);
			goto again;
		}
	/*
	 * It would be nice to ASSERT(RW_READ_HELD(&ufsvfsp->vfs_dqrwlock))
	 * here, but if we do, then shadow inode allocations panic the
	 * system.  We don't have to hold vfs_dqrwlock for shadow inodes
	 * and the ufs_iget() parameters don't tell us what we are getting
	 * so we have no way of knowing this is a ufs_iget() call from
	 * a ufs_ialloc() call for a shadow inode.
	 */
	rw_enter(&ip->i_contents, RW_WRITER);
	insque(ip, ih);
	mutex_exit(ihm);
	/*
	 * read the dinode
	 */
	bp = UFS_BREAD(ufsvfsp, ip->i_dev, bno, (int)fs->fs_bsize);

	/*
	 * Check I/O errors
	 */
	error = ((bp->b_flags & B_ERROR) ? geterror(bp) : 0);
	if (error) {
		brelse(bp);
		ip->i_flag |= ISTALE;	/* in case someone is looking it up */
		rw_exit(&ip->i_contents);
		vp->v_vfsp = &EIO_vfs;
		VN_RELE(vp);
		return (error);
	}
	/*
	 * initialize the inode's dinode
	 */
	dp = (struct dinode *)(ioff + bp->b_un.b_addr);
	ip->i_ic = dp->di_ic;	/* structure assignment */
	brelse(bp);

	/*
	 * Maintain compatibility with Solaris 1.x UFS
	 */
	if (ip->i_suid != UID_LONG)
		ip->i_uid = ip->i_suid;
	if (ip->i_sgid != GID_LONG)
		ip->i_gid = ip->i_sgid;

	ftype = ip->i_mode & IFMT;
	if (ftype == IFBLK || ftype == IFCHR) {
		dev_t dv;
		uint_t top16 = ip->i_ordev & 0xffff0000u;

		if (top16 == 0 || top16 == 0xffff0000u)
			dv = expdev(ip->i_ordev);
		else
			dv = expldev(ip->i_ordev);
		vp->v_rdev = ip->i_rdev = dv;
	}

	/*
	 * if our caller only expects allocated inodes, verify that
	 * this inode looks good; throw it out if it's bad.
	 */
	if (validate) {
		if ((ftype == 0) || (ip->i_nlink <= 0)) {
			ip->i_flag |= ISTALE;
			rw_exit(&ip->i_contents);
			vp->v_vfsp = &EIO_vfs;
			VN_RELE(vp);
			cmn_err(CE_NOTE,
			    "%s: unexpected free inode %d, run fsck(1M)%s",
			    fs->fs_fsmnt, (int)ino,
			    (TRANS_ISTRANS(ufsvfsp) ? " -o f" : ""));
			return (EIO);
		}
	}

	/*
	 * finish initializing the vnode
	 */
	vp->v_type = IFTOVT((mode_t)ip->i_mode);

	ufs_reset_vnode(vp);

	/*
	 * read the shadow
	 */
	if (ftype != 0 && ip->i_shadow != 0) {
		if ((error = ufs_si_load(ip, cr)) != 0) {
			ip->i_flag |= ISTALE;
			ip->i_ufs_acl = NULL;
			rw_exit(&ip->i_contents);
			vp->v_vfsp = &EIO_vfs;
			VN_RELE(vp);
			return (error);
		}
	}

	/*
	 * Only attach quota information if the inode has a type and if
	 * that type is not a shadow inode.
	 */
	if (ip->i_mode && ((ip->i_mode & IFMT) != IFSHAD) &&
	    ((ip->i_mode & IFMT) != IFATTRDIR)) {
		ip->i_dquot = getinoquota(ip);
	}
	TRANS_MATA_IGET(ufsvfsp, ip);
	*ipp = ip;
	rw_exit(&ip->i_contents);

	return (0);
}

/*
 * Vnode is no longer referenced, write the inode out
 * and if necessary, truncate and deallocate the file.
 */
void
ufs_iinactive(struct inode *ip)
{
	int		front;
	struct inode	*iq;
	struct inode	*hip;
	struct ufs_q	*uq;
	struct vnode	*vp = ITOV(ip);
	struct ufsvfs	*ufsvfsp = ip->i_ufsvfs;
	struct ufs_delq_info *delq_info = &ufsvfsp->vfs_delete_info;

	/*
	 * Because the vnode type might have been changed,
	 * the dnlc_dir_purge must be called unconditionally.
	 */
	dnlc_dir_purge(&ip->i_danchor);

	/*
	 * Get exclusive access to inode data.
	 */
	rw_enter(&ip->i_contents, RW_WRITER);
	ASSERT(ip->i_flag & IREF);

	/*
	 * Make sure no one reclaimed the inode before we put it on
	 * the freelist or destroy it.  We keep our 'hold' on the vnode
	 * from vn_rele until we are ready to do something with the inode.
	 *
	 * Pageout may put a VN_HOLD/VN_RELE at anytime during this
	 * operation via an async putpage, so we must make sure
	 * we don't free/destroy the inode more than once.  ufs_iget
	 * may also put a VN_HOLD on the inode before it grabs
	 * the i_contents lock.  This is done so we don't free
	 * an inode that a thread is waiting on.
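	 *
	 * If v_count is still greater than one below, another hold was
	 * taken while we were acquiring i_contents, so this is not the
	 * last reference after all.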
	 */
	mutex_enter(&vp->v_lock);

	if (vp->v_count > 1) {
		vp->v_count--;	/* release our hold from vn_rele */
		mutex_exit(&vp->v_lock);
		rw_exit(&ip->i_contents);
		return;
	}
	mutex_exit(&vp->v_lock);

	/*
	 * For umount case: if ufsvfs ptr is NULL, the inode is unhashed
	 * and clean.  It can be safely destroyed (cyf).
	 */
	if (ip->i_ufsvfs == NULL) {
		rw_exit(&ip->i_contents);
		ufs_si_del(ip);
		ASSERT((vp->v_type == VCHR) || !vn_has_cached_data(vp));
		ufs_free_inode(ip);
		return;
	}

	/*
	 * queue idle inode to appropriate thread. Will check v_count == 1
	 * prior to putting this on the appropriate queue.
	 * Stale inodes will be unhashed and freed by the ufs idle thread
	 * in ufs_idle_free()
	 */
	front = 1;
	if ((ip->i_flag & ISTALE) == 0 && ip->i_fs->fs_ronly == 0 &&
	    ip->i_mode && ip->i_nlink <= 0) {
		/*
		 * Mark the i_flag to indicate that inode is being deleted.
		 * This flag will be cleared when the deletion is complete.
		 * This prevents nfs from sneaking in via ufs_vget() while
		 * the delete is in progress (bugid 1242481).
		 */
		ip->i_flag |= IDEL;

		/*
		 * NOIDEL means that deletes are not allowed at this time;
		 * whoever resets NOIDEL will also send this inode back
		 * through ufs_iinactive.  IREF remains set.
		 */
		if (ULOCKFS_IS_NOIDEL(ITOUL(ip))) {
			mutex_enter(&vp->v_lock);
			vp->v_count--;
			mutex_exit(&vp->v_lock);
			rw_exit(&ip->i_contents);
			return;
		}
		if (!TRANS_ISTRANS(ip->i_ufsvfs)) {
			rw_exit(&ip->i_contents);
			ufs_delete(ip->i_ufsvfs, ip, 0);
			return;
		}

		/* queue to delete thread; IREF remains set */
		ins.in_qfree.value.ul++;
		uq = &ip->i_ufsvfs->vfs_delete;

		mutex_enter(&uq->uq_mutex);

		/* add to q */
		if ((iq = uq->uq_ihead) != 0) {
			ip->i_freef = iq;
			ip->i_freeb = iq->i_freeb;
			iq->i_freeb->i_freef = ip;
			iq->i_freeb = ip;
			if (front)
				uq->uq_ihead = ip;
		} else {
			uq->uq_ihead = ip;
			ip->i_freef = ip;
			ip->i_freeb = ip;
		}

		delq_info->delq_unreclaimed_files += 1;
		delq_info->delq_unreclaimed_blocks += ip->i_blocks;
	} else {
		/*
		 * queue to idle thread
		 *  Check the v_count == 1 again.
		 *
		 */
		mutex_enter(&vp->v_lock);
		if (vp->v_count > 1) {
			vp->v_count--;	/* release our hold from vn_rele */
			mutex_exit(&vp->v_lock);
			rw_exit(&ip->i_contents);
			return;
		}
		mutex_exit(&vp->v_lock);
		uq = &ufs_idle_q;

		/*
		 * useful iff it has pages or is a fastsymlink; otherwise junk
		 */
		mutex_enter(&uq->uq_mutex);

		/* clear IREF means `on idle list' */
		ip->i_flag &= ~(IREF | IDIRECTIO);

		if (vn_has_cached_data(vp) || ip->i_flag & IFASTSYMLNK) {
			ins.in_frback.value.ul++;
			hip = (inode_t *)&ufs_useful_iq[IQHASH(ip)];
			ufs_nuseful_iq++;
		} else {
			ins.in_frfront.value.ul++;
			hip = (inode_t *)&ufs_junk_iq[IQHASH(ip)];
			ip->i_flag |= IJUNKIQ;
			ufs_njunk_iq++;
		}
		ip->i_freef = hip;
		ip->i_freeb = hip->i_freeb;
		hip->i_freeb->i_freef = ip;
		hip->i_freeb = ip;
	}

	/* wakeup thread(s) if q is overfull */
	if (++uq->uq_ne == uq->uq_lowat)
		cv_broadcast(&uq->uq_cv);

	/* all done, release the q and inode */
	mutex_exit(&uq->uq_mutex);
	rw_exit(&ip->i_contents);
}

/*
 * Check accessed and update flags on an inode structure.
 * If any are on, update the inode with the (unique) current time.
 * If waitfor is given, ensure I/O order by waiting for the write to complete.
 */
void
ufs_iupdat(struct inode *ip, int waitfor)
{
	struct buf	*bp;
	struct fs	*fp;
	struct dinode	*dp;
	struct ufsvfs	*ufsvfsp	= ip->i_ufsvfs;
	int		i;
	int		do_trans_times;
	ushort_t	flag;
	o_uid_t		suid;
	o_gid_t		sgid;

	/*
	 * This function is now safe to be called with either the reader
	 * or writer i_contents lock.
	 */
	ASSERT(RW_LOCK_HELD(&ip->i_contents));

	/*
	 * Return if file system has been forcibly umounted.
	 */
	if (ufsvfsp == NULL)
		return;

	flag = ip->i_flag;	/* Atomic read */
	/*
	 * We better not update the disk inode from a stale inode.
	 */
	if (flag & ISTALE)
		return;

	fp = ip->i_fs;

	if ((flag & (IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG)) != 0) {
		if (fp->fs_ronly) {
			mutex_enter(&ip->i_tlock);
			ip->i_flag &= ~(IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG);
			mutex_exit(&ip->i_tlock);
			return;
		}
		/*
		 * fs is active while metadata is being written
		 */
		mutex_enter(&ufsvfsp->vfs_lock);
		ufs_notclean(ufsvfsp);
		/*
		 * get the dinode
		 */
		bp = UFS_BREAD(ufsvfsp, ip->i_dev,
		    (daddr_t)fsbtodb(fp, itod(fp, ip->i_number)),
		    (int)fp->fs_bsize);
		if (bp->b_flags & B_ERROR) {
			mutex_enter(&ip->i_tlock);
			ip->i_flag &=
			    ~(IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG);
			mutex_exit(&ip->i_tlock);
			brelse(bp);
			return;
		}
		/*
		 * munge inode fields
		 */
		mutex_enter(&ip->i_tlock);
		ITIMES_NOLOCK(ip);
		do_trans_times = ((ip->i_flag & (IMOD|IMODACC)) == IMODACC);
		ip->i_flag &= ~(IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG);
		mutex_exit(&ip->i_tlock);

		/*
		 * For reads and concurrent re-writes, no deltas were
		 * entered for the access time changes - do it now.
		 */
		if (do_trans_times) {
			TRANS_INODE_TIMES(ufsvfsp, ip);
		}

		/*
		 * For SunOS 5.0->5.4, these lines below read:
		 *
		 * suid = (ip->i_uid > MAXUID) ? UID_LONG : ip->i_uid;
		 * sgid = (ip->i_gid > MAXUID) ? GID_LONG : ip->i_gid;
		 *
		 * where MAXUID was set to 60002.  This was incorrect -
		 * the uids should have been constrained to what fitted into
		 * a 16-bit word.
		 *
		 * This means that files from 4.x filesystems that have an
		 * i_suid field larger than 60002 will have that field
		 * changed to 65535.
		 *
		 * Security note: 4.x UFS could never create an i_suid of
		 * UID_LONG since that would've corresponded to -1.
		 */
		suid = (ulong_t)ip->i_uid > (ulong_t)USHRT_MAX ?
		    UID_LONG : ip->i_uid;
		sgid = (ulong_t)ip->i_gid > (ulong_t)USHRT_MAX ?
		    GID_LONG : ip->i_gid;

		if ((ip->i_suid != suid) || (ip->i_sgid != sgid)) {
			ip->i_suid = suid;
			ip->i_sgid = sgid;
			TRANS_INODE(ufsvfsp, ip);
		}

		if ((ip->i_mode & IFMT) == IFBLK ||
		    (ip->i_mode & IFMT) == IFCHR) {
			dev_t d = ip->i_rdev;
			dev32_t dev32;

			/*
			 * load first direct block only if special device
			 */
			if (!cmpldev(&dev32, d)) {
				/*
				 * We panic here because there's "no way"
				 * we should have been able to create a large
				 * inode with a large dev_t.  Earlier layers
				 * should've caught this.
				 */
				panic("ip %p: i_rdev too big", (void *)ip);
			}

			if (dev32 & ~((O_MAXMAJ << L_BITSMINOR32) | O_MAXMIN)) {
				ip->i_ordev = dev32;	/* can't use old fmt. */
			} else {
				ip->i_ordev = cmpdev(d);
			}
		}

		/*
		 * copy inode to dinode (zero fastsymlnk in dinode)
		 */
		dp = (struct dinode *)bp->b_un.b_addr + itoo(fp, ip->i_number);
		dp->di_ic = ip->i_ic;	/* structure assignment */
		if (flag & IFASTSYMLNK) {
			for (i = 1; i < NDADDR; i++)
				dp->di_db[i] = 0;
			for (i = 0; i < NIADDR; i++)
				dp->di_ib[i] = 0;
		}
		if (TRANS_ISTRANS(ufsvfsp)) {
			/*
			 * Pass only a sector size buffer containing
			 * the inode, otherwise when the buffer is copied
			 * into a cached roll buffer then too much memory
			 * gets consumed if 8KB inode buffers are passed.
			 */
			TRANS_LOG(ufsvfsp, (caddr_t)dp, ip->i_doff,
			    sizeof (struct dinode),
			    (caddr_t)P2ALIGN((uintptr_t)dp, DEV_BSIZE),
			    DEV_BSIZE);

			brelse(bp);
		} else if (waitfor && (ip->i_ufsvfs->vfs_dio == 0)) {
			UFS_BRWRITE(ufsvfsp, bp);

			/*
			 * Synchronous write has guaranteed that inode
			 * has been written on disk so clear the flag
			 */
			mutex_enter(&ip->i_tlock);
			ip->i_flag &= ~IBDWRITE;
			mutex_exit(&ip->i_tlock);
		} else {
			bdrwrite(bp);

			/*
			 * This write hasn't guaranteed that inode has been
			 * written on the disk.
			 * Since all update flags on the inode are cleared, we
			 * must remember the condition in case the inode is to
			 * be updated synchronously later (e.g., fsync()/
			 * fdatasync()) and the inode has not been modified
			 * yet.
			 */
			mutex_enter(&ip->i_tlock);
			ip->i_flag |= IBDWRITE;
			mutex_exit(&ip->i_tlock);
		}
	} else {
		/*
		 * In case previous inode update was done asynchronously
		 * (IBDWRITE) and this inode update request wants guaranteed
		 * (synchronous) disk update, flush the inode.
		 */
		if (waitfor && (flag & IBDWRITE)) {
			blkflush(ip->i_dev,
			    (daddr_t)fsbtodb(fp, itod(fp, ip->i_number)));
			mutex_enter(&ip->i_tlock);
			ip->i_flag &= ~IBDWRITE;
			mutex_exit(&ip->i_tlock);
		}
	}
}

#define	SINGLE	0	/* index of single indirect block */
#define	DOUBLE	1	/* index of double indirect block */
#define	TRIPLE	2	/* index of triple indirect block */

/*
 * Release blocks associated with the inode ip and
 * stored in the indirect block bn.  Blocks are free'd
 * in LIFO order up to (but not including) lastbn.  If
 * level is greater than SINGLE, the block is an indirect
 * block and recursive calls to indirtrunc must be used to
 * cleanse other indirect blocks.
 *
 * N.B.: triple indirect blocks are untested.
 */
static long
indirtrunc(struct inode *ip, daddr_t bn, daddr_t lastbn, int level, int flags)
{
	int i;
	struct buf *bp, *copy;
	daddr32_t *bap;
	struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
	struct fs *fs = ufsvfsp->vfs_fs;
	daddr_t nb, last;
	long factor;
	int blocksreleased = 0, nblocks;

	ASSERT(RW_WRITE_HELD(&ip->i_contents));
	/*
	 * Calculate index in current block of last
	 * block to be kept.  -1 indicates the entire
	 * block so we need not calculate the index.
	 */
	factor = 1;
	for (i = SINGLE; i < level; i++)
		factor *= NINDIR(fs);
	last = lastbn;
	if (lastbn > 0)
		last /= factor;
	nblocks = btodb(fs->fs_bsize);
	/*
	 * Get buffer of block pointers, zero those
	 * entries corresponding to blocks to be free'd,
	 * and update on disk copy first.
	 * *Unless* the root pointer has been synchronously
	 * written to disk.  If nothing points to this
	 * indirect block then don't bother zero'ing and
	 * writing it.
	 */
	bp = UFS_BREAD(ufsvfsp,
	    ip->i_dev, (daddr_t)fsbtodb(fs, bn), (int)fs->fs_bsize);
	if (bp->b_flags & B_ERROR) {
		brelse(bp);
		return (0);
	}
	bap = bp->b_un.b_daddr;
	if ((flags & I_CHEAP) == 0) {
		uint_t	zb;

		zb = (uint_t)((NINDIR(fs) - (last + 1)) * sizeof (daddr32_t));

		if (zb) {
			/*
			 * push any data into the log before we zero it
			 */
			if (bp->b_flags & B_DELWRI)
				TRANS_LOG(ufsvfsp, (caddr_t)bap,
				    ldbtob(bp->b_blkno), bp->b_bcount,
				    bp->b_un.b_addr, bp->b_bcount);
			copy = ngeteblk(fs->fs_bsize);
			bcopy((caddr_t)bap, (caddr_t)copy->b_un.b_daddr,
			    (uint_t)fs->fs_bsize);
			bzero((caddr_t)&bap[last + 1], zb);

			TRANS_BUF(ufsvfsp,
			    (caddr_t)&bap[last + 1] - (caddr_t)bap,
			    zb, bp, DT_ABZERO);

			UFS_BRWRITE(ufsvfsp, bp);
			bp = copy, bap = bp->b_un.b_daddr;
		}
	} else {
		/* make sure write retries are also cleared */
		bp->b_flags &= ~(B_DELWRI | B_RETRYWRI);
		bp->b_flags |= B_STALE | B_AGE;
	}

	/*
	 * Recursively free totally unused blocks.
	 */
	flags |= I_CHEAP;
	for (i = NINDIR(fs) - 1; i > last; i--) {
		nb = bap[i];
		if (nb == 0)
			continue;
		if (level > SINGLE) {
			blocksreleased +=
			    indirtrunc(ip, nb, (daddr_t)-1, level - 1, flags);
			free(ip, nb, (off_t)fs->fs_bsize, flags | I_IBLK);
		} else
			free(ip, nb, (off_t)fs->fs_bsize, flags);
		blocksreleased += nblocks;
	}
	flags &= ~I_CHEAP;

	/*
	 * Recursively free last partial block.
	 */
	if (level > SINGLE && lastbn >= 0) {
		last = lastbn % factor;
		nb = bap[i];
		if (nb != 0)
			blocksreleased += indirtrunc(ip, nb, last, level - 1,
			    flags);
	}
	brelse(bp);
	return (blocksreleased);
}

/*
 * Truncate the inode ip to at most length size.
 * Free affected disk blocks -- the blocks of the
 * file are removed in reverse order.
 *
 * N.B.: triple indirect blocks are untested.
 */
static int i_genrand = 1234;
int
ufs_itrunc(struct inode *oip, u_offset_t length, int flags, cred_t *cr)
{
	struct fs *fs = oip->i_fs;
	struct ufsvfs *ufsvfsp = oip->i_ufsvfs;
	struct inode *ip;
	daddr_t lastblock;
	off_t bsize;
	int boff;
	daddr_t bn, lastiblock[NIADDR];
	int level;
	long nblocks, blocksreleased = 0;
	int i;
	ushort_t mode;
	struct inode tip;
	int err;
	u_offset_t maxoffset = (ufsvfsp->vfs_lfflags & UFS_LARGEFILES) ?
	    (UFS_MAXOFFSET_T) : (MAXOFF32_T);

	/*
	 * Shadow inodes do not need to hold the vfs_dqrwlock lock.  Most
	 * other uses need the reader lock.  opendq() holds the writer lock.
	 */
	ASSERT((oip->i_mode & IFMT) == IFSHAD ||
	    RW_LOCK_HELD(&ufsvfsp->vfs_dqrwlock));
	ASSERT(RW_WRITE_HELD(&oip->i_contents));
	/*
	 * We only allow truncation of regular files and directories
	 * to arbitrary lengths here.  In addition, we allow symbolic
	 * links to be truncated only to zero length.  Other inode
	 * types cannot have their length set here.  Disk blocks are
	 * being dealt with - especially device inodes where
	 * ip->i_ordev is actually being stored in ip->i_db[0]!
	 */
	TRANS_INODE(ufsvfsp, oip);
	mode = oip->i_mode & IFMT;
	if (flags & I_FREE) {
		i_genrand *= 16843009;		/* turns into shift and adds */
		i_genrand++;
		oip->i_gen += ((i_genrand + lbolt) & 0xffff) + 1;
		oip->i_flag |= ICHG|IUPD;
		oip->i_seq++;
		if (length == oip->i_size)
			return (0);
		flags |= I_CHEAP;
	}
	if (mode == IFIFO)
		return (0);
	if (mode != IFREG && mode != IFDIR && mode != IFATTRDIR &&
	    !(mode == IFLNK && length == (offset_t)0) && mode != IFSHAD)
		return (EINVAL);
	if (length > maxoffset)
		return (EFBIG);
	if ((mode == IFDIR) || (mode == IFATTRDIR))
		flags |= I_DIR;
	if (mode == IFSHAD)
		flags |= I_SHAD;
	if (oip == ufsvfsp->vfs_qinod)
		flags |= I_QUOTA;
	if (length == oip->i_size) {
		/* update ctime and mtime to please POSIX tests */
		oip->i_flag |= ICHG|IUPD;
		oip->i_seq++;
		if (length == 0) {
			/* nothing to cache so clear the flag */
			oip->i_flag &= ~IFASTSYMLNK;
		}
		return (0);
	}
	/* wipe out fast symlink till next access */
	if (oip->i_flag & IFASTSYMLNK) {
		int j;

		ASSERT(ITOV(oip)->v_type == VLNK);

		oip->i_flag &= ~IFASTSYMLNK;

		for (j = 1; j < NDADDR; j++)
			oip->i_db[j] = 0;
		for (j = 0; j < NIADDR; j++)
			oip->i_ib[j] = 0;
	}

	boff = (int)blkoff(fs, length);

	if (length > oip->i_size) {
		/*
		 * Trunc up case.  BMAPALLOC will ensure that the right blocks
		 * are allocated.  This includes extending the old frag to a
		 * full block (if needed) in addition to doing any work
		 * needed for allocating the last block.
		 */
		if (boff == 0)
			err = BMAPALLOC(oip, length - 1, (int)fs->fs_bsize, cr);
		else
			err = BMAPALLOC(oip, length - 1, boff, cr);

		if (err == 0) {
			/*
			 * Save old size and set inode's size now
			 * so that we don't cause too much of the
			 * file to be zero'd and pushed.
			 */
			u_offset_t osize = oip->i_size;
			oip->i_size = length;
			/*
			 * Make sure we zero out the remaining bytes of
			 * the page in case a mmap scribbled on it.  We
			 * can't prevent a mmap from writing beyond EOF
			 * on the last page of a file.
			 *
			 */
			if ((boff = (int)blkoff(fs, osize)) != 0) {
				bsize = (int)lblkno(fs, osize - 1) >= NDADDR ?
				    fs->fs_bsize : fragroundup(fs, boff);
				pvn_vpzero(ITOV(oip), osize,
				    (size_t)(bsize - boff));
			}
			oip->i_flag |= ICHG|IATTCHG;
			oip->i_seq++;
			ITIMES_NOLOCK(oip);
			/*
			 * MAXOFF32_T is the old 2GB size limit.  If
			 * this operation caused a large file to be
			 * created, turn on the superblock flag
			 * and update the superblock, if the flag
			 * is not already on.
			 */
			if ((length > (u_offset_t)MAXOFF32_T) &&
			    !(fs->fs_flags & FSLARGEFILES)) {
				ASSERT(ufsvfsp->vfs_lfflags & UFS_LARGEFILES);
				mutex_enter(&ufsvfsp->vfs_lock);
				fs->fs_flags |= FSLARGEFILES;
				ufs_sbwrite(ufsvfsp);
				mutex_exit(&ufsvfsp->vfs_lock);
			}
		}

		return (err);
	}

	/*
	 * Update the pages of the file.  If the file is not being
	 * truncated to a block boundary, the contents of the
	 * pages following the end of the file must be zero'ed
	 * in case it ever becomes accessible again because
	 * of subsequent file growth.
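	 * A boff of zero below means the new length falls exactly on a
	 * filesystem block boundary.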
	 */
	if (boff == 0) {
		(void) pvn_vplist_dirty(ITOV(oip), length, ufs_putapage,
		    B_INVAL | B_TRUNC, CRED());
	} else {
		/*
		 * Make sure that the last block is properly allocated.
		 * We only really have to do this if the last block is
		 * actually allocated since ufs_bmap will now handle the case
		 * of a fragment which has no block allocated.  Just to
		 * be sure, we do it now independent of current allocation.
		 */
		err = BMAPALLOC(oip, length - 1, boff, cr);
		if (err)
			return (err);

		/*
		 * BMAPALLOC will call bmap_write which defers i_seq
		 * processing.  If the timestamps were changed, update
		 * i_seq before rdip drops i_contents or syncs the inode.
		 */
		if (oip->i_flag & (ICHG|IUPD))
			oip->i_seq++;

		/*
		 * BugId 4069932
		 * Make sure that the relevant partial page appears in
		 * the v_pages list, so that pvn_vpzero() will do its
		 * job.  Since doing this correctly requires everything
		 * in rdip() except for the uiomove(), it's easier and
		 * safer to do the uiomove() rather than duplicate the
		 * rest of rdip() here.
		 *
		 * To get here, we know that length indicates a byte
		 * that is not the first byte of a block.  (length - 1)
		 * is the last actual byte known to exist.  Deduction
		 * shows it is in the same block as byte (length).
		 * Thus, this rdip() invocation should always succeed
		 * except in the face of i/o errors, and give us the
		 * block we care about.
		 *
		 * rdip() makes the same locking assertions and
		 * assumptions as we do.  We do not acquire any locks
		 * before calling it, so we have not changed the locking
		 * situation.  Finally, there do not appear to be any
		 * paths whereby rdip() ends up invoking us again.
		 * Thus, infinite recursion is avoided.
		 */
		{
			uio_t uio;
			iovec_t iov[1];
			char buffer;

			uio.uio_iov = iov;
			uio.uio_iovcnt = 1;
			uio.uio_loffset = length - 1;
			uio.uio_resid = 1;
			uio.uio_segflg = UIO_SYSSPACE;
			uio.uio_extflg = UIO_COPY_CACHED;

			iov[0].iov_base = &buffer;
			iov[0].iov_len = 1;

			err = rdip(oip, &uio, UIO_READ, NULL);
			if (err)
				return (err);
		}

		bsize = (int)lblkno(fs, length - 1) >= NDADDR ?
		    fs->fs_bsize : fragroundup(fs, boff);
		pvn_vpzero(ITOV(oip), length, (size_t)(bsize - boff));
		/*
		 * Ensure full fs block is marked as dirty.
		 */
		(void) pvn_vplist_dirty(ITOV(oip), length + (bsize - boff),
		    ufs_putapage, B_INVAL | B_TRUNC, CRED());
	}

	/*
	 * Calculate index into inode's block list of
	 * last direct and indirect blocks (if any)
	 * which we want to keep.  Lastblock is -1 when
	 * the file is truncated to 0.
	 */
	lastblock = lblkno(fs, length + fs->fs_bsize - 1) - 1;
	lastiblock[SINGLE] = lastblock - NDADDR;
	lastiblock[DOUBLE] = lastiblock[SINGLE] - NINDIR(fs);
	lastiblock[TRIPLE] = lastiblock[DOUBLE] - NINDIR(fs) * NINDIR(fs);
	nblocks = btodb(fs->fs_bsize);

	/*
	 * Update file and block pointers
	 * on disk before we start freeing blocks.
	 * If we crash before free'ing blocks below,
	 * the blocks will be returned to the free list.
	 * lastiblock values are also normalized to -1
	 * for calls to indirtrunc below.
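	 * (If we do crash there, the blocks are referenced by neither the
	 * inode nor the free list, and fsck(1M) returns them to the free
	 * list.)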
	 */
	tip = *oip;			/* structure copy */
	ip = &tip;

	for (level = TRIPLE; level >= SINGLE; level--)
		if (lastiblock[level] < 0) {
			oip->i_ib[level] = 0;
			lastiblock[level] = -1;
		}
	for (i = NDADDR - 1; i > lastblock; i--) {
		oip->i_db[i] = 0;
		flags |= I_CHEAP;
	}
	oip->i_size = length;
	oip->i_flag |= ICHG|IUPD|IATTCHG;
	oip->i_seq++;
	if (!TRANS_ISTRANS(ufsvfsp))
		ufs_iupdat(oip, I_SYNC);	/* do sync inode update */

	/*
	 * Indirect blocks first.
	 */
	for (level = TRIPLE; level >= SINGLE; level--) {
		bn = ip->i_ib[level];
		if (bn != 0) {
			blocksreleased +=
			    indirtrunc(ip, bn, lastiblock[level], level, flags);
			if (lastiblock[level] < 0) {
				ip->i_ib[level] = 0;
				free(ip, bn, (off_t)fs->fs_bsize,
				    flags | I_IBLK);
				blocksreleased += nblocks;
			}
		}
		if (lastiblock[level] >= 0)
			goto done;
	}

	/*
	 * All whole direct blocks or frags.
	 */
	for (i = NDADDR - 1; i > lastblock; i--) {
		bn = ip->i_db[i];
		if (bn == 0)
			continue;
		ip->i_db[i] = 0;
		bsize = (off_t)blksize(fs, ip, i);
		free(ip, bn, bsize, flags);
		blocksreleased += btodb(bsize);
	}
	if (lastblock < 0)
		goto done;

	/*
	 * Finally, look for a change in size of the
	 * last direct block; release any frags.
	 */
	bn = ip->i_db[lastblock];
	if (bn != 0) {
		off_t oldspace, newspace;

		/*
		 * Calculate amount of space we're giving
		 * back as old block size minus new block size.
		 */
		oldspace = blksize(fs, ip, lastblock);
		UFS_SET_ISIZE(length, ip);
		newspace = blksize(fs, ip, lastblock);
		if (newspace == 0) {
			err = ufs_fault(ITOV(ip), "ufs_itrunc: newspace == 0");
			return (err);
		}
		if (oldspace - newspace > 0) {
			/*
			 * Block number of space to be free'd is
			 * the old block # plus the number of frags
			 * required for the storage we're keeping.
			 */
			bn += numfrags(fs, newspace);
			free(ip, bn, oldspace - newspace, flags);
			blocksreleased += btodb(oldspace - newspace);
		}
	}
done:
	/* BEGIN PARANOIA */
	for (level = SINGLE; level <= TRIPLE; level++)
		if (ip->i_ib[level] != oip->i_ib[level]) {
			err = ufs_fault(ITOV(ip), "ufs_itrunc: indirect block");
			return (err);
		}

	for (i = 0; i < NDADDR; i++)
		if (ip->i_db[i] != oip->i_db[i]) {
			err = ufs_fault(ITOV(ip), "ufs_itrunc: direct block");
			return (err);
		}
	/* END PARANOIA */
	oip->i_blocks -= blocksreleased;

	if (oip->i_blocks < 0) {		/* sanity */
		cmn_err(CE_NOTE,
		    "ufs_itrunc: %s/%d new size = %lld, blocks = %d\n",
		    fs->fs_fsmnt, (int)oip->i_number, oip->i_size,
		    (int)oip->i_blocks);
		oip->i_blocks = 0;
	}
	oip->i_flag |= ICHG|IATTCHG;
	oip->i_seq++;
	/* blocksreleased is >= zero, so this cannot fail */
	(void) chkdq(oip, -blocksreleased, 0, cr, (char **)NULL,
	    (size_t *)NULL);
	return (0);
}

/*
 * Check mode permission on inode.  Mode is READ, WRITE or EXEC.
 * In the case of WRITE, the read-only status of the file system
 * is checked.  Depending on the calling user, the appropriate
 * mode bits are selected; privileges to override missing permission
 * bits are checked through secpolicy_vnode_access().
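 *
 * The requested mode is expressed in the owner-position bits
 * (IREAD/IWRITE/IEXEC); shifting i_mode left by 3 or 6 moves the group
 * or other permission bits up into that position for the comparison.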
 */
int
ufs_iaccess(void *vip, int mode, struct cred *cr)
{
	struct inode *ip = vip;
	int shift = 0;

	if (mode & IWRITE) {
		/*
		 * Disallow write attempts on read-only
		 * file systems, unless the file is a block
		 * or character device or a FIFO.
		 */
		if (ip->i_fs->fs_ronly != 0) {
			if ((ip->i_mode & IFMT) != IFCHR &&
			    (ip->i_mode & IFMT) != IFBLK &&
			    (ip->i_mode & IFMT) != IFIFO) {
				return (EROFS);
			}
		}
	}
	/*
	 * If there is a shadow inode check for the presence of an acl,
	 * if the acl is there use the ufs_acl_access routine to check
	 * the acl
	 */
	if (ip->i_ufs_acl && ip->i_ufs_acl->aowner)
		return (ufs_acl_access(ip, mode, cr));

	/*
	 * Access check is based on only
	 * one of owner, group, public.
	 * If not owner, then check group.
	 * If not a member of the group, then
	 * check public access.
	 */
	if (crgetuid(cr) != ip->i_uid) {
		shift += 3;
		if (!groupmember((uid_t)ip->i_gid, cr))
			shift += 3;
	}

	mode &= ~(ip->i_mode << shift);

	if (mode == 0)
		return (0);

	/* test missing privilege bits */
	return (secpolicy_vnode_access(cr, ITOV(ip), ip->i_uid, mode));
}

/*
 * if necessary, remove an inode from the free list
 * i_contents is held except at unmount
 *
 * Return 1 if the inode is taken off of the ufs_idle_q,
 * and the caller is expected to call VN_RELE.
 *
 * Return 0 otherwise.
 */
int
ufs_rmidle(struct inode *ip)
{
	int rval = 0;

	mutex_enter(&ip->i_tlock);
	if ((ip->i_flag & IREF) == 0) {
		mutex_enter(&ufs_idle_q.uq_mutex);
		ip->i_freef->i_freeb = ip->i_freeb;
		ip->i_freeb->i_freef = ip->i_freef;
		ip->i_freef = ip;
		ip->i_freeb = ip;
		ip->i_flag |= IREF;
		ufs_idle_q.uq_ne--;
		if (ip->i_flag & IJUNKIQ) {
			ufs_njunk_iq--;
			ip->i_flag &= ~IJUNKIQ;
		} else {
			ufs_nuseful_iq--;
		}
		mutex_exit(&ufs_idle_q.uq_mutex);
		rval = 1;
	}
	mutex_exit(&ip->i_tlock);
	return (rval);
}

/*
 * scan the hash of inodes and call func with the inode locked
 */
int
ufs_scan_inodes(int rwtry, int (*func)(struct inode *, void *), void *arg,
    struct ufsvfs *ufsvfsp)
{
	struct inode		*ip;		/* current inode */
	struct inode		*lip = NULL;	/* last/previous inode */
	union ihead		*ih;		/* current hash chain */
	int			error, i;
	int			saverror = 0;
	int			lip_held;	/* lip needs a VN_RELE() */

	/*
	 * If ufsvfsp is NULL, then our caller should be holding
	 * ufs_scan_lock to avoid conflicts between ufs_unmount() and
	 * ufs_update().  Otherwise, to avoid false-positives in
	 * ufs_unmount()'s v_count-based EBUSY check, we only hold
	 * those inodes that are in the file system our caller cares
	 * about.
	 *
	 * We know that ip is a valid inode in the hash chain (and thus
	 * we can trust i_ufsvfs) because the inode we chained from
	 * (lip) is still in the hash chain.  This is true because either:
	 *
	 * 1. We did not drop the hash chain lock since the last
	 *    iteration (because we were not interested in the last inode),
	 * or
	 * 2. We maintained a hold on the last inode while we
	 *    were processing it, so it could not be removed
	 *    from the hash chain.
	 *
	 * The whole reason we're dropping and re-grabbing the chain
	 * lock on every inode is so that we don't present a major
	 * choke point on throughput, particularly when we've been
	 * called on behalf of fsflush.
	 */

	for (i = 0, ih = ihead; i < inohsz; i++, ih++) {
		mutex_enter(&ih_lock[i]);
		for (ip = ih->ih_chain[0], lip_held = 0;
		    ip != (struct inode *)ih;
		    ip = lip->i_forw) {

			ins.in_scan.value.ul++;

			/*
			 * Undo the previous iteration's VN_HOLD(), but
			 * only if one was done.
			 */
			if (lip_held)
				VN_RELE(ITOV(lip));

			lip = ip;
			if (ufsvfsp != NULL && ip->i_ufsvfs != ufsvfsp) {
				/*
				 * We're not processing all inodes, and
				 * this inode is not in the filesystem of
				 * interest, so skip it.  No need to do a
				 * VN_HOLD() since we're not dropping the
				 * hash chain lock until after we've
				 * done the i_forw traversal above.
				 */
				lip_held = 0;
				continue;
			}
			VN_HOLD(ITOV(ip));
			lip_held = 1;
			mutex_exit(&ih_lock[i]);

			/*
			 * Acquire the contents lock as writer to make
			 * sure that the inode has been initialized in
			 * the cache or removed from the idle list by
			 * ufs_iget().  This works because ufs_iget()
			 * acquires the contents lock before putting
			 * the inode into the cache.  If we can lock
			 * it, then he's done with it.
			 */

			if (rwtry) {
				if (!rw_tryenter(&ip->i_contents, RW_WRITER)) {
					mutex_enter(&ih_lock[i]);
					continue;
				}
			} else {
				rw_enter(&ip->i_contents, RW_WRITER);
			}

			rw_exit(&ip->i_contents);

			/*
			 * ISTALE means the inode couldn't be read
			 *
			 * We don't have to hold the i_contents lock
			 * for this check for a couple of
			 * reasons.  First, if ISTALE is set then the
			 * flag cannot be cleared until the inode is
			 * removed from the cache and that cannot
			 * happen until after we VN_RELE() it.
			 * Second, if ISTALE is not set, then the
			 * inode is in the cache and does not need to
			 * be read from disk so ISTALE cannot be set
			 * while we are not looking.
			 */
			if ((ip->i_flag & ISTALE) == 0) {
				if ((error = (*func)(ip, arg)) != 0)
					saverror = error;
			}

			mutex_enter(&ih_lock[i]);
		}
		if (lip_held)
			VN_RELE(ITOV(lip));
		mutex_exit(&ih_lock[i]);
	}
	return (saverror);
}

/*
 * Mark inode with the current time, plus a unique increment.
 *
 * Since we only keep 32-bit time on disk, if UFS is still alive
 * beyond 2038, filesystem times will simply stick at the last
 * possible second of 32-bit time.  Not ideal, but probably better
 * than going into the remote past, or confusing applications with
 * negative time.
 */
void
ufs_imark(struct inode *ip)
{
	timestruc_t now;
	int32_t usec, nsec;

	/*
	 * The update of i_seq may have been deferred, increase i_seq here
	 * to make sure it is in sync with the timestamps.
	 */
	if (ip->i_flag & ISEQ) {
		ASSERT(ip->i_flag & (IUPD|ICHG));
		ip->i_seq++;
		ip->i_flag &= ~ISEQ;
	}

	gethrestime(&now);

	/*
	 * Fast algorithm to convert nsec to usec -- see hrt2ts()
	 * in common/os/timers.c for a full description.
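	 * The net effect is approximately usec = nsec / 1000, computed
	 * with shifts and adds instead of a division.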
	 */
	nsec = now.tv_nsec;
	usec = nsec + (nsec >> 2);
	usec = nsec + (usec >> 1);
	usec = nsec + (usec >> 2);
	usec = nsec + (usec >> 4);
	usec = nsec - (usec >> 3);
	usec = nsec + (usec >> 2);
	usec = nsec + (usec >> 3);
	usec = nsec + (usec >> 4);
	usec = nsec + (usec >> 1);
	usec = nsec + (usec >> 6);
	usec = usec >> 10;

	mutex_enter(&ufs_iuniqtime_lock);
	if (now.tv_sec > (time_t)iuniqtime.tv_sec ||
	    usec > iuniqtime.tv_usec) {
		if (now.tv_sec < TIME32_MAX) {
			iuniqtime.tv_sec = (time32_t)now.tv_sec;
			iuniqtime.tv_usec = usec;
		}
	} else {
		if (iuniqtime.tv_sec < TIME32_MAX) {
			iuniqtime.tv_usec++;
			/* Check for usec overflow */
			if (iuniqtime.tv_usec >= MICROSEC) {
				iuniqtime.tv_sec++;
				iuniqtime.tv_usec = 0;
			}
		}
	}

	if ((ip->i_flag & IACC) && !(ip->i_ufsvfs->vfs_noatime)) {
		ip->i_atime = iuniqtime;
	}
	if (ip->i_flag & IUPD) {
		ip->i_mtime = iuniqtime;
		ip->i_flag |= IMODTIME;
	}
	if (ip->i_flag & ICHG) {
		ip->i_diroff = 0;
		ip->i_ctime = iuniqtime;
	}
	mutex_exit(&ufs_iuniqtime_lock);
}

/*
 * Update timestamps in inode.
 */
void
ufs_itimes_nolock(struct inode *ip)
{

	/*
	 * if noatime is set and the inode access time is the only field that
	 * must be changed, exit immediately.
	 */
	if (((ip->i_flag & (IUPD|IACC|ICHG)) == IACC) &&
	    (ip->i_ufsvfs->vfs_noatime)) {
		return;
	}

	if (ip->i_flag & (IUPD|IACC|ICHG)) {
		if (ip->i_flag & ICHG)
			ip->i_flag |= IMOD;
		else
			ip->i_flag |= IMODACC;
		ufs_imark(ip);
		ip->i_flag &= ~(IACC|IUPD|ICHG);
	}
}