/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 1983, 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/uio.h>
#include <sys/bitmap.h>
#include <sys/signal.h>
#include <sys/cred.h>
#include <sys/user.h>
#include <sys/vfs.h>
#include <sys/stat.h>
#include <sys/vnode.h>
#include <sys/buf.h>
#include <sys/proc.h>
#include <sys/disp.h>
#include <sys/dnlc.h>
#include <sys/mode.h>
#include <sys/cmn_err.h>
#include <sys/kstat.h>
#include <sys/acl.h>
#include <sys/var.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_fs.h>
#include <sys/fs/ufs_trans.h>
#include <sys/fs/ufs_acl.h>
#include <sys/fs/ufs_bio.h>
#include <sys/fs/ufs_quota.h>
#include <sys/fs/ufs_log.h>
#include <vm/hat.h>
#include <vm/as.h>
#include <vm/pvn.h>
#include <vm/seg.h>
#include <sys/swap.h>
#include <sys/cpuvar.h>
#include <sys/sysmacros.h>
#include <sys/errno.h>
#include <sys/kmem.h>
#include <sys/debug.h>
#include <fs/fs_subr.h>
#include <sys/policy.h>

struct kmem_cache *inode_cache;		/* cache of free inodes */

/* UFS Inode Cache Stats -- Not protected */
struct instats ins = {
	{ "size",		KSTAT_DATA_ULONG },
	{ "maxsize",		KSTAT_DATA_ULONG },
	{ "hits",		KSTAT_DATA_ULONG },
	{ "misses",		KSTAT_DATA_ULONG },
	{ "kmem allocs",	KSTAT_DATA_ULONG },
	{ "kmem frees",		KSTAT_DATA_ULONG },
	{ "maxsize reached",	KSTAT_DATA_ULONG },
	{ "puts at frontlist",	KSTAT_DATA_ULONG },
	{ "puts at backlist",	KSTAT_DATA_ULONG },
	{ "queues to free",	KSTAT_DATA_ULONG },
	{ "scans",		KSTAT_DATA_ULONG },
	{ "thread idles",	KSTAT_DATA_ULONG },
	{ "lookup idles",	KSTAT_DATA_ULONG },
	{ "vget idles",		KSTAT_DATA_ULONG },
	{ "cache allocs",	KSTAT_DATA_ULONG },
	{ "cache frees",	KSTAT_DATA_ULONG },
	{ "pushes at close",	KSTAT_DATA_ULONG }
};

/* kstat data */
static kstat_t *ufs_inode_kstat = NULL;

union ihead *ihead;	/* inode LRU cache, Chris Maltby */
kmutex_t *ih_lock;	/* protect inode cache hash table */
static int ino_hashlen = 4;	/* desired average hash chain length */
int inohsz;		/* number of buckets in the hash table */
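/*
 * Added note: inohsz and ih_lock are sized together in ihinit() below,
 * one bucket mutex per hash bucket, with the bucket count rounded to a
 * power of two derived from ufs_ninode / ino_hashlen so that chains
 * average about ino_hashlen entries.
 */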
kmutex_t ufs_scan_lock;	/* stop racing multiple ufs_scan_inodes() */
kmutex_t ufs_iuniqtime_lock;	/* protect iuniqtime */
kmutex_t ufsvfs_mutex;
struct ufsvfs *oldufsvfslist, *ufsvfslist;

/*
 * time to wait after ufsvfsp->vfs_iotstamp before declaring that no
 * I/Os are going on.
 */
clock_t ufs_iowait;

/*
 * the threads that process idle inodes and free (deleted) inodes
 * have high water marks that are set in ufs_iinit().
 * These values can be no less than the minimum shown below.
 */
int ufs_idle_max;	/* # of allowable idle inodes */
ulong_t ufs_inode_max;	/* hard limit of allowable idle inodes */
#define	UFS_IDLE_MAX	(16)	/* min # of allowable idle inodes */

/*
 * Tunables for ufs write throttling.
 * These are validated in ufs_iinit() since improper settings
 * can lead to filesystem hangs.
 */
#define	UFS_HW_DEFAULT	(16 * 1024 * 1024)
#define	UFS_LW_DEFAULT	(8 * 1024 * 1024)
int	ufs_HW = UFS_HW_DEFAULT;
int	ufs_LW = UFS_LW_DEFAULT;

static void ihinit(void);
extern int hash2ints(int, int);

static int ufs_iget_internal(struct vfs *, ino_t, struct inode **,
	struct cred *, int);

/* ARGSUSED */
static int
ufs_inode_kstat_update(kstat_t *ksp, int rw)
{
	if (rw == KSTAT_WRITE)
		return (EACCES);

	ins.in_malloc.value.ul	= (ulong_t)kmem_cache_stat(inode_cache,
	    "slab_alloc");
	ins.in_mfree.value.ul	= (ulong_t)kmem_cache_stat(inode_cache,
	    "slab_free");
	ins.in_kcalloc.value.ul	= (ulong_t)kmem_cache_stat(inode_cache,
	    "alloc");
	ins.in_kcfree.value.ul	= (ulong_t)kmem_cache_stat(inode_cache,
	    "free");
	ins.in_size.value.ul	= (ulong_t)kmem_cache_stat(inode_cache,
	    "buf_inuse");
	ins.in_maxreached.value.ul = (ulong_t)kmem_cache_stat(inode_cache,
	    "buf_max");
	ins.in_misses.value.ul = ins.in_kcalloc.value.ul;

	return (0);
}

void
ufs_iinit(void)
{
	/*
	 * Validate that ufs_HW > ufs_LW.
	 * The default values for these two tunables have been increased.
	 * There is now a range of values for ufs_HW that used to be
	 * legal on previous Solaris versions but no longer is now.
	 * Upgrading a machine which has an /etc/system setting for ufs_HW
	 * from that range can lead to filesystem hangs unless the values
	 * are checked here.
	 */
	if (ufs_HW <= ufs_LW) {
		cmn_err(CE_WARN,
		    "ufs_HW (%d) <= ufs_LW (%d). Check /etc/system.",
		    ufs_HW, ufs_LW);
		ufs_LW = UFS_LW_DEFAULT;
		ufs_HW = UFS_HW_DEFAULT;
		cmn_err(CE_CONT, "using defaults, ufs_HW = %d, ufs_LW = %d\n",
		    ufs_HW, ufs_LW);
	}

	/*
	 * Adjust the tunable `ufs_ninode' to a reasonable value
	 */
	if (ufs_ninode <= 0)
		ufs_ninode = ncsize;
	if (ufs_inode_max == 0)
		ufs_inode_max =
		    (ulong_t)((kmem_maxavail() >> 2) / sizeof (struct inode));
	if (ufs_ninode > ufs_inode_max || (ufs_ninode == 0 && ncsize == 0)) {
		cmn_err(CE_NOTE, "setting ufs_ninode to max value of %ld",
		    ufs_inode_max);
		ufs_ninode = ufs_inode_max;
	}
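	/*
	 * Added note: ufs_ninode, ufs_HW and ufs_LW are normally tuned
	 * from /etc/system; the values below are illustrative only, e.g.
	 *
	 *	set ufs:ufs_ninode = 128000
	 *	set ufs:ufs_HW = 0x2000000
	 *	set ufs:ufs_LW = 0x1000000
	 *
	 * Any such override is sanity-checked by the code above.
	 */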
	/*
	 * Wait till third call of ufs_update to declare that no I/Os are
	 * going on. This allows deferred access times to be flushed to disk.
	 */
	ufs_iowait = v.v_autoup * hz * 2;

	/*
	 * idle thread runs when 25% of ufs_ninode entries are on the queue
	 */
	if (ufs_idle_max == 0)
		ufs_idle_max = ufs_ninode >> 2;
	if (ufs_idle_max < UFS_IDLE_MAX)
		ufs_idle_max = UFS_IDLE_MAX;
	if (ufs_idle_max > ufs_ninode)
		ufs_idle_max = ufs_ninode;
	/*
	 * This is really a misnomer, it is ufs_queue_init
	 */
	ufs_thread_init(&ufs_idle_q, ufs_idle_max);
	ufs_thread_start(&ufs_idle_q, ufs_thread_idle, NULL);

	/*
	 * global hlock thread
	 */
	ufs_thread_init(&ufs_hlock, 1);
	ufs_thread_start(&ufs_hlock, ufs_thread_hlock, NULL);

	ihinit();
	qtinit();
	ins.in_maxsize.value.ul = ufs_ninode;
	if ((ufs_inode_kstat = kstat_create("ufs", 0, "inode_cache", "ufs",
	    KSTAT_TYPE_NAMED, sizeof (ins) / sizeof (kstat_named_t),
	    KSTAT_FLAG_VIRTUAL)) != NULL) {
		ufs_inode_kstat->ks_data = (void *)&ins;
		ufs_inode_kstat->ks_update = ufs_inode_kstat_update;
		kstat_install(ufs_inode_kstat);
	}
	ufsfx_init();		/* fix-on-panic initialization */
	si_cache_init();
	ufs_directio_init();
	lufs_init();
	mutex_init(&ufs_iuniqtime_lock, NULL, MUTEX_DEFAULT, NULL);
}

/* ARGSUSED */
static int
ufs_inode_cache_constructor(void *buf, void *cdrarg, int kmflags)
{
	struct inode *ip = buf;
	struct vnode *vp;

	vp = ip->i_vnode = vn_alloc(kmflags);
	if (vp == NULL) {
		return (-1);
	}
	vn_setops(vp, ufs_vnodeops);
	vp->v_data = ip;

	rw_init(&ip->i_rwlock, NULL, RW_DEFAULT, NULL);
	rw_init(&ip->i_contents, NULL, RW_DEFAULT, NULL);
	mutex_init(&ip->i_tlock, NULL, MUTEX_DEFAULT, NULL);
	dnlc_dir_init(&ip->i_danchor);

	cv_init(&ip->i_wrcv, NULL, CV_DRIVER, NULL);

	return (0);
}

/* ARGSUSED */
static void
ufs_inode_cache_destructor(void *buf, void *cdrarg)
{
	struct inode *ip = buf;
	struct vnode *vp;

	vp = ITOV(ip);

	rw_destroy(&ip->i_rwlock);
	rw_destroy(&ip->i_contents);
	mutex_destroy(&ip->i_tlock);
	if (vp->v_type == VDIR) {
		dnlc_dir_fini(&ip->i_danchor);
	}

	cv_destroy(&ip->i_wrcv);

	vn_free(vp);
}
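/*
 * Added note: the constructor/destructor above run only when the kmem
 * cache grows or shrinks, not on every allocation.  A cached inode
 * therefore keeps its vnode, rwlocks, mutex and condvar across
 * kmem_cache_alloc()/kmem_cache_free() cycles, which is why
 * ufs_alloc_inode() below only reinitializes the per-use fields.
 */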
/*
 * Initialize hash links for inodes
 * and build inode free list.
 */
void
ihinit(void)
{
	int i;
	union ihead *ih = ihead;

	mutex_init(&ufs_scan_lock, NULL, MUTEX_DEFAULT, NULL);

	inohsz = 1 << highbit(ufs_ninode / ino_hashlen);
	ihead = kmem_zalloc(inohsz * sizeof (union ihead), KM_SLEEP);
	ih_lock = kmem_zalloc(inohsz * sizeof (kmutex_t), KM_SLEEP);

	for (i = 0, ih = ihead; i < inohsz; i++, ih++) {
		ih->ih_head[0] = ih;
		ih->ih_head[1] = ih;
		mutex_init(&ih_lock[i], NULL, MUTEX_DEFAULT, NULL);
	}
	inode_cache = kmem_cache_create("ufs_inode_cache",
	    sizeof (struct inode), 0, ufs_inode_cache_constructor,
	    ufs_inode_cache_destructor, ufs_inode_cache_reclaim,
	    NULL, NULL, 0);
}

/*
 * Free an inode structure
 */
void
ufs_free_inode(struct inode *ip)
{
	vn_invalid(ITOV(ip));
	kmem_cache_free(inode_cache, ip);
}

/*
 * Allocate an inode structure
 */
struct inode *
ufs_alloc_inode(ufsvfs_t *ufsvfsp, ino_t ino)
{
	struct inode *ip;
	vnode_t *vp;

	ip = kmem_cache_alloc(inode_cache, KM_SLEEP);
	/*
	 * at this point we have a newly allocated inode
	 */
	ip->i_freef = ip;
	ip->i_freeb = ip;
	ip->i_flag = IREF;
	ip->i_seq = 0xFF;	/* Unique initial value */
	ip->i_dev = ufsvfsp->vfs_dev;
	ip->i_ufsvfs = ufsvfsp;
	ip->i_devvp = ufsvfsp->vfs_devvp;
	ip->i_number = ino;
	ip->i_diroff = 0;
	ip->i_nextr = 0;
	ip->i_map = NULL;
	ip->i_rdev = 0;
	ip->i_writes = 0;
	ip->i_mode = 0;
	ip->i_delaylen = 0;
	ip->i_delayoff = 0;
	ip->i_nextrio = 0;
	ip->i_ufs_acl = NULL;
	ip->i_cflags = 0;
	ip->i_mapcnt = 0;
	ip->i_dquot = NULL;
	ip->i_cachedir = CD_ENABLED;
	ip->i_writer = NULL;

	/*
	 * the vnode for this inode was allocated by the constructor
	 */
	vp = ITOV(ip);
	vn_reinit(vp);
	if (ino == (ino_t)UFSROOTINO)
		vp->v_flag = VROOT;
	vp->v_vfsp = ufsvfsp->vfs_vfs;
	vn_exists(vp);
	return (ip);
}

/*
 * Look up an inode by device, inumber.  If it is in core (in the
 * inode structure), honor the locking protocol.  If it is not in
 * core, read it in from the specified device after freeing any pages.
 * In all cases, a pointer to a VN_HELD inode structure is returned.
 */
int
ufs_iget(struct vfs *vfsp, ino_t ino, struct inode **ipp, struct cred *cr)
{
	return (ufs_iget_internal(vfsp, ino, ipp, cr, 0));
}

/*
 * A version of ufs_iget which returns only allocated, linked inodes.
 * This is appropriate for any callers who do not expect a free inode.
 */
int
ufs_iget_alloced(struct vfs *vfsp, ino_t ino, struct inode **ipp,
	struct cred *cr)
{
	return (ufs_iget_internal(vfsp, ino, ipp, cr, 1));
}
/*
 * Set vnode attributes based on v_type, this should be called whenever
 * an inode's i_mode is changed.
 */
void
ufs_reset_vnode(vnode_t *vp)
{
	/*
	 * an old DBE hack
	 */
	if ((VTOI(vp)->i_mode & (ISVTX | IEXEC | IFDIR)) == ISVTX)
		vp->v_flag |= VSWAPLIKE;
	else
		vp->v_flag &= ~VSWAPLIKE;

	/*
	 * if not swap like and it's just a regular file, we want
	 * to maintain the vnode's pages sorted by clean/modified
	 * for faster sync'ing to disk
	 */
	if (vp->v_type == VREG)
		vp->v_flag |= VMODSORT;
	else
		vp->v_flag &= ~VMODSORT;

	/*
	 * Is this an attribute hidden dir?
	 */
	if ((VTOI(vp)->i_mode & IFMT) == IFATTRDIR)
		vp->v_flag |= V_XATTRDIR;
	else
		vp->v_flag &= ~V_XATTRDIR;
}

/*
 * Shared implementation of ufs_iget and ufs_iget_alloced.  The 'validate'
 * flag is used to distinguish the two; when true, we validate that the inode
 * being retrieved looks like a linked and allocated inode.
 */
/* ARGSUSED */
static int
ufs_iget_internal(struct vfs *vfsp, ino_t ino, struct inode **ipp,
	struct cred *cr, int validate)
{
	struct inode *ip, *sp;
	union ihead *ih;
	kmutex_t *ihm;
	struct buf *bp;
	struct dinode *dp;
	struct vnode *vp;
	extern vfs_t EIO_vfs;
	int error;
	int ftype;	/* XXX - Remove later on */
	dev_t vfs_dev;
	struct ufsvfs *ufsvfsp;
	struct fs *fs;
	int hno;
	daddr_t bno;
	ulong_t ioff;

	CPU_STATS_ADD_K(sys, ufsiget, 1);

	/*
	 * Lookup inode in cache.
	 */
	vfs_dev = vfsp->vfs_dev;
	hno = INOHASH(ino);
	ih = &ihead[hno];
	ihm = &ih_lock[hno];

again:
	mutex_enter(ihm);
	for (ip = ih->ih_chain[0]; ip != (struct inode *)ih; ip = ip->i_forw) {
		if (ino != ip->i_number || vfs_dev != ip->i_dev ||
		    (ip->i_flag & ISTALE))
			continue;

		/*
		 * Found the interesting inode; hold it and drop the cache lock
		 */
		vp = ITOV(ip);	/* for locknest */
		VN_HOLD(vp);
		mutex_exit(ihm);
		rw_enter(&ip->i_contents, RW_READER);

		/*
		 * if necessary, remove from idle list
		 */
		if ((ip->i_flag & IREF) == 0) {
			if (ufs_rmidle(ip))
				VN_RELE(vp);
		}

		/*
		 * Could the inode be read from disk?
		 */
		if (ip->i_flag & ISTALE) {
			rw_exit(&ip->i_contents);
			VN_RELE(vp);
			goto again;
		}

		ins.in_hits.value.ul++;
		*ipp = ip;

		/*
		 * Reset the vnode's attribute flags
		 */
		mutex_enter(&vp->v_lock);
		ufs_reset_vnode(vp);
		mutex_exit(&vp->v_lock);

		rw_exit(&ip->i_contents);

		return (0);
	}
	mutex_exit(ihm);

	/*
	 * Inode was not in cache.
	 *
	 * Allocate a new entry
	 */
	ufsvfsp = (struct ufsvfs *)vfsp->vfs_data;
	fs = ufsvfsp->vfs_fs;

	ip = ufs_alloc_inode(ufsvfsp, ino);
	vp = ITOV(ip);

	bno = fsbtodb(fs, itod(fs, ino));
	ioff = (sizeof (struct dinode)) * (itoo(fs, ino));
	ip->i_doff = (offset_t)ioff + ldbtob(bno);

	/*
	 * put a place holder in the cache (if not already there)
	 */
	mutex_enter(ihm);
	for (sp = ih->ih_chain[0]; sp != (struct inode *)ih; sp = sp->i_forw)
		if (ino == sp->i_number && vfs_dev == sp->i_dev &&
		    ((sp->i_flag & ISTALE) == 0)) {
			mutex_exit(ihm);
			ufs_free_inode(ip);
			goto again;
		}
	/*
	 * It would be nice to ASSERT(RW_READ_HELD(&ufsvfsp->vfs_dqrwlock))
	 * here, but if we do, then shadow inode allocations panic the
	 * system.  We don't have to hold vfs_dqrwlock for shadow inodes
	 * and the ufs_iget() parameters don't tell us what we are getting
	 * so we have no way of knowing this is a ufs_iget() call from
	 * a ufs_ialloc() call for a shadow inode.
	 */
	rw_enter(&ip->i_contents, RW_WRITER);
	insque(ip, ih);
	mutex_exit(ihm);
	/*
	 * read the dinode
	 */
	bp = UFS_BREAD(ufsvfsp, ip->i_dev, bno, (int)fs->fs_bsize);

	/*
	 * Check I/O errors
	 */
	error = ((bp->b_flags & B_ERROR) ? geterror(bp) : 0);
	if (error) {
		brelse(bp);
		ip->i_flag |= ISTALE;	/* in case someone is looking it up */
		rw_exit(&ip->i_contents);
		vp->v_vfsp = &EIO_vfs;
		VN_RELE(vp);
		return (error);
	}
	/*
	 * initialize the inode's dinode
	 */
	dp = (struct dinode *)(ioff + bp->b_un.b_addr);
	ip->i_ic = dp->di_ic;	/* structure assignment */
	brelse(bp);

	/*
	 * Maintain compatibility with Solaris 1.x UFS
	 */
	if (ip->i_suid != UID_LONG)
		ip->i_uid = ip->i_suid;
	if (ip->i_sgid != GID_LONG)
		ip->i_gid = ip->i_sgid;

	ftype = ip->i_mode & IFMT;
	if (ftype == IFBLK || ftype == IFCHR) {
		dev_t dv;
		uint_t top16 = ip->i_ordev & 0xffff0000u;

		if (top16 == 0 || top16 == 0xffff0000u)
			dv = expdev(ip->i_ordev);
		else
			dv = expldev(ip->i_ordev);
		vp->v_rdev = ip->i_rdev = dv;
	}

	/*
	 * if our caller only expects allocated inodes, verify that
	 * this inode looks good; throw it out if it's bad.
	 */
	if (validate) {
		if ((ftype == 0) || (ip->i_nlink <= 0)) {
			ip->i_flag |= ISTALE;
			rw_exit(&ip->i_contents);
			vp->v_vfsp = &EIO_vfs;
			VN_RELE(vp);
			cmn_err(CE_NOTE,
			    "%s: unexpected free inode %d, run fsck(1M)%s",
			    fs->fs_fsmnt, (int)ino,
			    (TRANS_ISTRANS(ufsvfsp) ? " -o f" : ""));
			return (EIO);
		}
	}

	/*
	 * Finish initializing the vnode, special handling for shadow inodes
	 * because IFTOVT() will produce a v_type of VNON which is not what we
	 * want, set v_type to VREG explicitly in that case.
	 */
	if (ftype == IFSHAD) {
		vp->v_type = VREG;
	} else {
		vp->v_type = IFTOVT((mode_t)ip->i_mode);
	}

	ufs_reset_vnode(vp);

	/*
	 * read the shadow
	 */
	if (ftype != 0 && ip->i_shadow != 0) {
		if ((error = ufs_si_load(ip, cr)) != 0) {
			ip->i_flag |= ISTALE;
			ip->i_ufs_acl = NULL;
			rw_exit(&ip->i_contents);
			vp->v_vfsp = &EIO_vfs;
			VN_RELE(vp);
			return (error);
		}
	}

	/*
	 * Only attach quota information if the inode has a type and if
	 * that type is not a shadow inode.
	 */
	if (ip->i_mode && ((ip->i_mode & IFMT) != IFSHAD) &&
	    ((ip->i_mode & IFMT) != IFATTRDIR)) {
		ip->i_dquot = getinoquota(ip);
	}
	TRANS_MATA_IGET(ufsvfsp, ip);
	*ipp = ip;
	rw_exit(&ip->i_contents);

	return (0);
}

/*
 * Vnode is no longer referenced, write the inode out
 * and if necessary, truncate and deallocate the file.
 */
void
ufs_iinactive(struct inode *ip)
{
	int front;
	struct inode *iq;
	struct inode *hip;
	struct ufs_q *uq;
	struct vnode *vp = ITOV(ip);
	struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
	struct ufs_delq_info *delq_info = &ufsvfsp->vfs_delete_info;

	/*
	 * Because the vnode type might have been changed,
	 * the dnlc_dir_purge must be called unconditionally.
	 */
	dnlc_dir_purge(&ip->i_danchor);

	/*
	 * Get exclusive access to inode data.
	 */
	rw_enter(&ip->i_contents, RW_WRITER);
	ASSERT(ip->i_flag & IREF);

	/*
	 * Make sure no one reclaimed the inode before we put it on
	 * the freelist or destroy it.  We keep our 'hold' on the vnode
	 * from vn_rele until we are ready to do something with the inode.
	 *
	 * Pageout may put a VN_HOLD/VN_RELE at anytime during this
	 * operation via an async putpage, so we must make sure
	 * we don't free/destroy the inode more than once.  ufs_iget
	 * may also put a VN_HOLD on the inode before it grabs
	 * the i_contents lock.  This is done so we don't free
	 * an inode that a thread is waiting on.
	 */
	mutex_enter(&vp->v_lock);

	if (vp->v_count > 1) {
		vp->v_count--;	/* release our hold from vn_rele */
		mutex_exit(&vp->v_lock);
		rw_exit(&ip->i_contents);
		return;
	}
	mutex_exit(&vp->v_lock);

	/*
	 * For umount case: if ufsvfs ptr is NULL, the inode is unhashed
	 * and clean.  It can be safely destroyed (cyf).
	 */
	if (ip->i_ufsvfs == NULL) {
		rw_exit(&ip->i_contents);
		ufs_si_del(ip);
		ASSERT((vp->v_type == VCHR) || !vn_has_cached_data(vp));
		ufs_free_inode(ip);
		return;
	}

	/*
	 * queue idle inode to appropriate thread. Will check v_count == 1
	 * prior to putting this on the appropriate queue.
	 * Stale inodes will be unhashed and freed by the ufs idle thread
	 * in ufs_idle_free()
	 */
	front = 1;
	if ((ip->i_flag & ISTALE) == 0 && ip->i_fs->fs_ronly == 0 &&
	    ip->i_mode && ip->i_nlink <= 0) {
		/*
		 * Mark the i_flag to indicate that inode is being deleted.
		 * This flag will be cleared when the deletion is complete.
		 * This prevents nfs from sneaking in via ufs_vget() while
		 * the delete is in progress (bugid 1242481).
		 */
		ip->i_flag |= IDEL;

		/*
		 * NOIDEL means that deletes are not allowed at this time;
		 * whoever resets NOIDEL will also send this inode back
		 * through ufs_iinactive.  IREF remains set.
		 */
		if (ULOCKFS_IS_NOIDEL(ITOUL(ip))) {
			mutex_enter(&vp->v_lock);
			vp->v_count--;
			mutex_exit(&vp->v_lock);
			rw_exit(&ip->i_contents);
			return;
		}
		if (!TRANS_ISTRANS(ip->i_ufsvfs)) {
			rw_exit(&ip->i_contents);
			ufs_delete(ip->i_ufsvfs, ip, 0);
			return;
		}

		/* queue to delete thread; IREF remains set */
		ins.in_qfree.value.ul++;
		uq = &ip->i_ufsvfs->vfs_delete;

		mutex_enter(&uq->uq_mutex);

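		/*
		 * Added note: i_freef/i_freeb link the inode into a
		 * circular, doubly-linked queue.  The inode is spliced
		 * in just behind the current head (i.e. at the tail),
		 * and since front is set above, uq_ihead is then moved
		 * to the new inode so the delete thread services it
		 * first.
		 */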
		/* add to q */
		if ((iq = uq->uq_ihead) != 0) {
			ip->i_freef = iq;
			ip->i_freeb = iq->i_freeb;
			iq->i_freeb->i_freef = ip;
			iq->i_freeb = ip;
			if (front)
				uq->uq_ihead = ip;
		} else {
			uq->uq_ihead = ip;
			ip->i_freef = ip;
			ip->i_freeb = ip;
		}

		delq_info->delq_unreclaimed_files += 1;
		delq_info->delq_unreclaimed_blocks += ip->i_blocks;
	} else {
		/*
		 * queue to idle thread
		 * Check the v_count == 1 again.
		 */
		mutex_enter(&vp->v_lock);
		if (vp->v_count > 1) {
			vp->v_count--;	/* release our hold from vn_rele */
			mutex_exit(&vp->v_lock);
			rw_exit(&ip->i_contents);
			return;
		}
		mutex_exit(&vp->v_lock);
		uq = &ufs_idle_q;

		/*
		 * useful iff it has pages or is a fastsymlink; otherwise junk
		 */
		mutex_enter(&uq->uq_mutex);

		/* clear IREF means `on idle list' */
		ip->i_flag &= ~(IREF | IDIRECTIO);

		if (vn_has_cached_data(vp) || ip->i_flag & IFASTSYMLNK) {
			ins.in_frback.value.ul++;
			hip = (inode_t *)&ufs_useful_iq[IQHASH(ip)];
			ufs_nuseful_iq++;
		} else {
			ins.in_frfront.value.ul++;
			hip = (inode_t *)&ufs_junk_iq[IQHASH(ip)];
			ip->i_flag |= IJUNKIQ;
			ufs_njunk_iq++;
		}
		ip->i_freef = hip;
		ip->i_freeb = hip->i_freeb;
		hip->i_freeb->i_freef = ip;
		hip->i_freeb = ip;
	}

	/* wakeup thread(s) if q is overfull */
	if (++uq->uq_ne == uq->uq_lowat)
		cv_broadcast(&uq->uq_cv);

	/* all done, release the q and inode */
	mutex_exit(&uq->uq_mutex);
	rw_exit(&ip->i_contents);
}

/*
 * Check accessed and update flags on an inode structure.
 * If any are on, update the inode with the (unique) current time.
 * If waitfor is given, ensure I/O ordering by waiting for the write
 * to complete.
 */
void
ufs_iupdat(struct inode *ip, int waitfor)
{
	struct buf *bp;
	struct fs *fp;
	struct dinode *dp;
	struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
	int i;
	int do_trans_times;
	ushort_t flag;
	o_uid_t suid;
	o_gid_t sgid;

	/*
	 * This function is now safe to be called with either the reader
	 * or writer i_contents lock.
	 */
	ASSERT(RW_LOCK_HELD(&ip->i_contents));

	/*
	 * Return if file system has been forcibly umounted.
	 */
	if (ufsvfsp == NULL)
		return;

	flag = ip->i_flag;	/* Atomic read */
	/*
	 * We better not update the disk inode from a stale inode.
	 */
	if (flag & ISTALE)
		return;

	fp = ip->i_fs;

	if ((flag & (IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG)) != 0) {
		if (fp->fs_ronly) {
			mutex_enter(&ip->i_tlock);
			ip->i_flag &= ~(IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG);
			mutex_exit(&ip->i_tlock);
			return;
		}
		/*
		 * fs is active while metadata is being written
		 */
		mutex_enter(&ufsvfsp->vfs_lock);
		ufs_notclean(ufsvfsp);
		/*
		 * get the dinode
		 */
		bp = UFS_BREAD(ufsvfsp, ip->i_dev,
		    (daddr_t)fsbtodb(fp, itod(fp, ip->i_number)),
		    (int)fp->fs_bsize);
		if (bp->b_flags & B_ERROR) {
			mutex_enter(&ip->i_tlock);
			ip->i_flag &=
			    ~(IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG);
			mutex_exit(&ip->i_tlock);
			brelse(bp);
			return;
		}
		/*
		 * munge inode fields
		 */
		mutex_enter(&ip->i_tlock);
		ITIMES_NOLOCK(ip);
		do_trans_times = ((ip->i_flag & (IMOD|IMODACC)) == IMODACC);
		ip->i_flag &= ~(IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG);
		mutex_exit(&ip->i_tlock);

		/*
		 * For reads and concurrent re-writes, no deltas were
		 * entered for the access time changes - do it now.
		 */
		if (do_trans_times) {
			TRANS_INODE_TIMES(ufsvfsp, ip);
		}

		/*
		 * For SunOS 5.0->5.4, these lines below read:
		 *
		 * suid = (ip->i_uid > MAXUID) ? UID_LONG : ip->i_uid;
		 * sgid = (ip->i_gid > MAXUID) ? GID_LONG : ip->i_gid;
		 *
		 * where MAXUID was set to 60002.  This was incorrect -
		 * the uids should have been constrained to what fitted into
		 * a 16-bit word.
		 *
		 * This means that files from 4.x filesystems that have an
		 * i_suid field larger than 60002 will have that field
		 * changed to 65535.
		 *
		 * Security note: 4.x UFS could never create an i_suid of
		 * UID_LONG since that would've corresponded to -1.
		 */
		suid = (ulong_t)ip->i_uid > (ulong_t)USHRT_MAX ?
		    UID_LONG : ip->i_uid;
		sgid = (ulong_t)ip->i_gid > (ulong_t)USHRT_MAX ?
		    GID_LONG : ip->i_gid;

		if ((ip->i_suid != suid) || (ip->i_sgid != sgid)) {
			ip->i_suid = suid;
			ip->i_sgid = sgid;
			TRANS_INODE(ufsvfsp, ip);
		}

		if ((ip->i_mode & IFMT) == IFBLK ||
		    (ip->i_mode & IFMT) == IFCHR) {
			dev_t d = ip->i_rdev;
			dev32_t dev32;

			/*
			 * load first direct block only if special device
			 */
			if (!cmpldev(&dev32, d)) {
				/*
				 * We panic here because there's "no way"
				 * we should have been able to create a large
				 * inode with a large dev_t.  Earlier layers
				 * should've caught this.
				 */
				panic("ip %p: i_rdev too big", (void *)ip);
			}

			if (dev32 & ~((O_MAXMAJ << L_BITSMINOR32) | O_MAXMIN)) {
				ip->i_ordev = dev32;	/* can't use old fmt. */
			} else {
				ip->i_ordev = cmpdev(d);
			}
		}

		/*
		 * copy inode to dinode (zero fastsymlnk in dinode)
		 */
		dp = (struct dinode *)bp->b_un.b_addr + itoo(fp, ip->i_number);
		dp->di_ic = ip->i_ic;	/* structure assignment */
		if (flag & IFASTSYMLNK) {
			for (i = 1; i < NDADDR; i++)
				dp->di_db[i] = 0;
			for (i = 0; i < NIADDR; i++)
				dp->di_ib[i] = 0;
		}
		if (TRANS_ISTRANS(ufsvfsp)) {
			/*
			 * Pass only a sector size buffer containing
			 * the inode, otherwise when the buffer is copied
			 * into a cached roll buffer then too much memory
			 * gets consumed if 8KB inode buffers are passed.
			 */
			TRANS_LOG(ufsvfsp, (caddr_t)dp, ip->i_doff,
			    sizeof (struct dinode),
			    (caddr_t)P2ALIGN((uintptr_t)dp, DEV_BSIZE),
			    DEV_BSIZE);

			brelse(bp);
		} else if (waitfor && (ip->i_ufsvfs->vfs_dio == 0)) {
			UFS_BRWRITE(ufsvfsp, bp);

			/*
			 * Synchronous write has guaranteed that inode
			 * has been written on disk so clear the flag
			 */
			mutex_enter(&ip->i_tlock);
			ip->i_flag &= ~IBDWRITE;
			mutex_exit(&ip->i_tlock);
		} else {
			bdrwrite(bp);

			/*
			 * This write hasn't guaranteed that inode has been
			 * written on the disk.
			 * Since, all updat flags on inode are cleared, we must
			 * remember the condition in case inode is to be updated
			 * synchronously later (e.g.- fsync()/fdatasync())
			 * and inode has not been modified yet.
			 */
			mutex_enter(&ip->i_tlock);
			ip->i_flag |= IBDWRITE;
			mutex_exit(&ip->i_tlock);
		}
	} else {
		/*
		 * In case previous inode update was done asynchronously
		 * (IBDWRITE) and this inode update request wants guaranteed
		 * (synchronous) disk update, flush the inode.
		 */
		if (waitfor && (flag & IBDWRITE)) {
			blkflush(ip->i_dev,
			    (daddr_t)fsbtodb(fp, itod(fp, ip->i_number)));
			mutex_enter(&ip->i_tlock);
			ip->i_flag &= ~IBDWRITE;
			mutex_exit(&ip->i_tlock);
		}
	}
}

#define	SINGLE	0	/* index of single indirect block */
#define	DOUBLE	1	/* index of double indirect block */
#define	TRIPLE	2	/* index of triple indirect block */
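/*
 * Added note: NINDIR(fs) is the number of daddr32_t block pointers that
 * fit in one indirect block (fs_bsize / sizeof (daddr32_t)).  For
 * example, with an 8 KB block size a single indirect block maps 2048
 * data blocks, a double indirect maps 2048 * 2048, and a triple
 * indirect maps 2048 * 2048 * 2048.
 */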
/*
 * Release blocks associated with the inode ip and
 * stored in the indirect block bn.  Blocks are free'd
 * in LIFO order up to (but not including) lastbn.  If
 * level is greater than SINGLE, the block is an indirect
 * block and recursive calls to indirtrunc must be used to
 * cleanse other indirect blocks.
 *
 * N.B.: triple indirect blocks are untested.
 */
static long
indirtrunc(struct inode *ip, daddr_t bn, daddr_t lastbn, int level, int flags)
{
	int i;
	struct buf *bp, *copy;
	daddr32_t *bap;
	struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
	struct fs *fs = ufsvfsp->vfs_fs;
	daddr_t nb, last;
	long factor;
	int blocksreleased = 0, nblocks;

	ASSERT(RW_WRITE_HELD(&ip->i_contents));
	/*
	 * Calculate index in current block of last
	 * block to be kept.  -1 indicates the entire
	 * block so we need not calculate the index.
	 */
	factor = 1;
	for (i = SINGLE; i < level; i++)
		factor *= NINDIR(fs);
	last = lastbn;
	if (lastbn > 0)
		last /= factor;
	nblocks = btodb(fs->fs_bsize);
	/*
	 * Get buffer of block pointers, zero those
	 * entries corresponding to blocks to be free'd,
	 * and update on disk copy first.
	 * *Unless* the root pointer has been synchronously
	 * written to disk.  If nothing points to this
	 * indirect block then don't bother zero'ing and
	 * writing it.
	 */
	bp = UFS_BREAD(ufsvfsp,
	    ip->i_dev, (daddr_t)fsbtodb(fs, bn), (int)fs->fs_bsize);
	if (bp->b_flags & B_ERROR) {
		brelse(bp);
		return (0);
	}
	bap = bp->b_un.b_daddr;
	if ((flags & I_CHEAP) == 0) {
		uint_t zb;

		zb = (uint_t)((NINDIR(fs) - (last + 1)) * sizeof (daddr32_t));

		if (zb) {
			/*
			 * push any data into the log before we zero it
			 */
			if (bp->b_flags & B_DELWRI)
				TRANS_LOG(ufsvfsp, (caddr_t)bap,
				    ldbtob(bp->b_blkno), bp->b_bcount,
				    bp->b_un.b_addr, bp->b_bcount);
			copy = ngeteblk(fs->fs_bsize);
			bcopy((caddr_t)bap, (caddr_t)copy->b_un.b_daddr,
			    (uint_t)fs->fs_bsize);
			bzero((caddr_t)&bap[last + 1], zb);

			TRANS_BUF(ufsvfsp,
			    (caddr_t)&bap[last + 1] - (caddr_t)bap,
			    zb, bp, DT_ABZERO);

			UFS_BRWRITE(ufsvfsp, bp);
			bp = copy, bap = bp->b_un.b_daddr;
		}
	} else {
		/* make sure write retries are also cleared */
		bp->b_flags &= ~(B_DELWRI | B_RETRYWRI);
		bp->b_flags |= B_STALE | B_AGE;
	}

	/*
	 * Recursively free totally unused blocks.
	 */
	flags |= I_CHEAP;
	for (i = NINDIR(fs) - 1; i > last; i--) {
		nb = bap[i];
		if (nb == 0)
			continue;
		if (level > SINGLE) {
			blocksreleased +=
			    indirtrunc(ip, nb, (daddr_t)-1, level - 1, flags);
			free(ip, nb, (off_t)fs->fs_bsize, flags | I_IBLK);
		} else
			free(ip, nb, (off_t)fs->fs_bsize, flags);
		blocksreleased += nblocks;
	}
	flags &= ~I_CHEAP;

	/*
	 * Recursively free last partial block.
	 */
	if (level > SINGLE && lastbn >= 0) {
		last = lastbn % factor;
		nb = bap[i];
		if (nb != 0)
			blocksreleased +=
			    indirtrunc(ip, nb, last, level - 1, flags);
	}
	brelse(bp);
	return (blocksreleased);
}
/*
 * Truncate the inode ip to at most length size.
 * Free affected disk blocks -- the blocks of the
 * file are removed in reverse order.
 *
 * N.B.: triple indirect blocks are untested.
 */
static int i_genrand = 1234;
int
ufs_itrunc(struct inode *oip, u_offset_t length, int flags, cred_t *cr)
{
	struct fs *fs = oip->i_fs;
	struct ufsvfs *ufsvfsp = oip->i_ufsvfs;
	struct inode *ip;
	daddr_t lastblock;
	off_t bsize;
	int boff;
	daddr_t bn, lastiblock[NIADDR];
	int level;
	long nblocks, blocksreleased = 0;
	int i;
	ushort_t mode;
	struct inode tip;
	int err;
	u_offset_t maxoffset = (ufsvfsp->vfs_lfflags & UFS_LARGEFILES) ?
	    (UFS_MAXOFFSET_T) : (MAXOFF32_T);

	/*
	 * Shadow inodes do not need to hold the vfs_dqrwlock lock.  Most
	 * other uses need the reader lock.  opendq() holds the writer lock.
	 */
	ASSERT((oip->i_mode & IFMT) == IFSHAD ||
	    RW_LOCK_HELD(&ufsvfsp->vfs_dqrwlock));
	ASSERT(RW_WRITE_HELD(&oip->i_contents));
	/*
	 * We only allow truncation of regular files and directories
	 * to arbitrary lengths here.  In addition, we allow symbolic
	 * links to be truncated only to zero length.  Other inode
	 * types cannot have their length set here.  Disk blocks are
	 * being dealt with - especially device inodes where
	 * ip->i_ordev is actually being stored in ip->i_db[0]!
	 */
	TRANS_INODE(ufsvfsp, oip);
	mode = oip->i_mode & IFMT;
	if (flags & I_FREE) {
		i_genrand *= 16843009;	/* turns into shift and adds */
		i_genrand++;
		oip->i_gen += ((i_genrand + ddi_get_lbolt()) & 0xffff) + 1;
		oip->i_flag |= ICHG |IUPD;
		oip->i_seq++;
		if (length == oip->i_size)
			return (0);
		flags |= I_CHEAP;
	}
	if (mode == IFIFO)
		return (0);
	if (mode != IFREG && mode != IFDIR && mode != IFATTRDIR &&
	    !(mode == IFLNK && length == (offset_t)0) && mode != IFSHAD)
		return (EINVAL);
	if (length > maxoffset)
		return (EFBIG);
	if ((mode == IFDIR) || (mode == IFATTRDIR))
		flags |= I_DIR;
	if (mode == IFSHAD)
		flags |= I_SHAD;
	if (oip == ufsvfsp->vfs_qinod)
		flags |= I_QUOTA;
	if (length == oip->i_size) {
		/* update ctime and mtime to please POSIX tests */
		oip->i_flag |= ICHG |IUPD;
		oip->i_seq++;
		if (length == 0) {
			/* nothing to cache so clear the flag */
			oip->i_flag &= ~IFASTSYMLNK;
		}
		return (0);
	}
	/* wipe out fast symlink till next access */
	if (oip->i_flag & IFASTSYMLNK) {
		int j;

		ASSERT(ITOV(oip)->v_type == VLNK);

		oip->i_flag &= ~IFASTSYMLNK;

		for (j = 1; j < NDADDR; j++)
			oip->i_db[j] = 0;
		for (j = 0; j < NIADDR; j++)
			oip->i_ib[j] = 0;
	}

	boff = (int)blkoff(fs, length);

	if (length > oip->i_size) {
		/*
		 * Trunc up case.  BMAPALLOC will ensure that the right blocks
		 * are allocated.  This includes extending the old frag to a
		 * full block (if needed) in addition to doing any work
		 * needed for allocating the last block.
		 */
		if (boff == 0)
			err = BMAPALLOC(oip, length - 1, (int)fs->fs_bsize, cr);
		else
			err = BMAPALLOC(oip, length - 1, boff, cr);

		if (err == 0) {
			/*
			 * Save old size and set inode's size now
			 * so that we don't cause too much of the
			 * file to be zero'd and pushed.
			 */
			u_offset_t osize = oip->i_size;
			oip->i_size = length;
			/*
			 * Make sure we zero out the remaining bytes of
			 * the page in case a mmap scribbled on it.  We
			 * can't prevent a mmap from writing beyond EOF
			 * on the last page of a file.
			 */
			if ((boff = (int)blkoff(fs, osize)) != 0) {
				bsize = (int)lblkno(fs, osize - 1) >= NDADDR ?
				    fs->fs_bsize : fragroundup(fs, boff);
				pvn_vpzero(ITOV(oip), osize,
				    (size_t)(bsize - boff));
			}
			oip->i_flag |= ICHG|IATTCHG;
			oip->i_seq++;
			ITIMES_NOLOCK(oip);
			/*
			 * MAXOFF32_T is old 2GB size limit.  If
			 * this operation caused a large file to be
			 * created, turn on the superblock flag
			 * and update the superblock, if the flag
			 * is not already on.
			 */
			if ((length > (u_offset_t)MAXOFF32_T) &&
			    !(fs->fs_flags & FSLARGEFILES)) {
				ASSERT(ufsvfsp->vfs_lfflags & UFS_LARGEFILES);
				mutex_enter(&ufsvfsp->vfs_lock);
				fs->fs_flags |= FSLARGEFILES;
				ufs_sbwrite(ufsvfsp);
				mutex_exit(&ufsvfsp->vfs_lock);
			}
		}

		return (err);
	}

	/*
	 * Update the pages of the file.  If the file is not being
	 * truncated to a block boundary, the contents of the
	 * pages following the end of the file must be zero'ed
	 * in case it ever becomes accessible again because
	 * of subsequent file growth.
	 */
	if (boff == 0) {
		(void) pvn_vplist_dirty(ITOV(oip), length, ufs_putapage,
		    B_INVAL | B_TRUNC, CRED());
	} else {
		/*
		 * Make sure that the last block is properly allocated.
		 * We only really have to do this if the last block is
		 * actually allocated since ufs_bmap will now handle the case
		 * of a fragment which has no block allocated.  Just to
		 * be sure, we do it now independent of current allocation.
		 */
		err = BMAPALLOC(oip, length - 1, boff, cr);
		if (err)
			return (err);

		/*
		 * BMAPALLOC will call bmap_write which defers i_seq
		 * processing.  If the timestamps were changed, update
		 * i_seq before rdip drops i_contents or syncs the inode.
		 */
		if (oip->i_flag & (ICHG|IUPD))
			oip->i_seq++;

		/*
		 * BugId 4069932
		 * Make sure that the relevant partial page appears in
		 * the v_pages list, so that pvn_vpzero() will do its
		 * job.  Since doing this correctly requires everything
		 * in rdip() except for the uiomove(), it's easier and
		 * safer to do the uiomove() rather than duplicate the
		 * rest of rdip() here.
		 *
		 * To get here, we know that length indicates a byte
		 * that is not the first byte of a block.  (length - 1)
		 * is the last actual byte known to exist.  Deduction
		 * shows it is in the same block as byte (length).
		 * Thus, this rdip() invocation should always succeed
		 * except in the face of i/o errors, and give us the
		 * block we care about.
		 *
		 * rdip() makes the same locking assertions and
		 * assumptions as we do.  We do not acquire any locks
		 * before calling it, so we have not changed the locking
		 * situation.  Finally, there do not appear to be any
		 * paths whereby rdip() ends up invoking us again.
		 * Thus, infinite recursion is avoided.
		 */
		{
			uio_t uio;
			iovec_t iov[1];
			char buffer;

			uio.uio_iov = iov;
			uio.uio_iovcnt = 1;
			uio.uio_loffset = length - 1;
			uio.uio_resid = 1;
			uio.uio_segflg = UIO_SYSSPACE;
			uio.uio_extflg = UIO_COPY_CACHED;

			iov[0].iov_base = &buffer;
			iov[0].iov_len = 1;

			err = rdip(oip, &uio, UIO_READ, NULL);
			if (err)
				return (err);
		}
		bsize = (int)lblkno(fs, length - 1) >= NDADDR ?
		    fs->fs_bsize : fragroundup(fs, boff);
		pvn_vpzero(ITOV(oip), length, (size_t)(bsize - boff));
		/*
		 * Ensure full fs block is marked as dirty.
		 */
		(void) pvn_vplist_dirty(ITOV(oip), length + (bsize - boff),
		    ufs_putapage, B_INVAL | B_TRUNC, CRED());
	}

	/*
	 * Calculate index into inode's block list of
	 * last direct and indirect blocks (if any)
	 * which we want to keep.  Lastblock is -1 when
	 * the file is truncated to 0.
	 */
	lastblock = lblkno(fs, length + fs->fs_bsize - 1) - 1;
	lastiblock[SINGLE] = lastblock - NDADDR;
	lastiblock[DOUBLE] = lastiblock[SINGLE] - NINDIR(fs);
	lastiblock[TRIPLE] = lastiblock[DOUBLE] - NINDIR(fs) * NINDIR(fs);
	nblocks = btodb(fs->fs_bsize);

	/*
	 * Update file and block pointers
	 * on disk before we start freeing blocks.
	 * If we crash before free'ing blocks below,
	 * the blocks will be returned to the free list.
	 * lastiblock values are also normalized to -1
	 * for calls to indirtrunc below.
	 */
	tip = *oip;			/* structure copy */
	ip = &tip;

	for (level = TRIPLE; level >= SINGLE; level--)
		if (lastiblock[level] < 0) {
			oip->i_ib[level] = 0;
			lastiblock[level] = -1;
		}
	for (i = NDADDR - 1; i > lastblock; i--) {
		oip->i_db[i] = 0;
		flags |= I_CHEAP;
	}
	oip->i_size = length;
	oip->i_flag |= ICHG|IUPD|IATTCHG;
	oip->i_seq++;
	if (!TRANS_ISTRANS(ufsvfsp))
		ufs_iupdat(oip, I_SYNC);	/* do sync inode update */

	/*
	 * Indirect blocks first.
	 */
	for (level = TRIPLE; level >= SINGLE; level--) {
		bn = ip->i_ib[level];
		if (bn != 0) {
			blocksreleased +=
			    indirtrunc(ip, bn, lastiblock[level], level, flags);
			if (lastiblock[level] < 0) {
				ip->i_ib[level] = 0;
				free(ip, bn, (off_t)fs->fs_bsize,
				    flags | I_IBLK);
				blocksreleased += nblocks;
			}
		}
		if (lastiblock[level] >= 0)
			goto done;
	}

	/*
	 * All whole direct blocks or frags.
	 */
	for (i = NDADDR - 1; i > lastblock; i--) {
		bn = ip->i_db[i];
		if (bn == 0)
			continue;
		ip->i_db[i] = 0;
		bsize = (off_t)blksize(fs, ip, i);
		free(ip, bn, bsize, flags);
		blocksreleased += btodb(bsize);
	}
	if (lastblock < 0)
		goto done;

	/*
	 * Finally, look for a change in size of the
	 * last direct block; release any frags.
	 */
	bn = ip->i_db[lastblock];
	if (bn != 0) {
		off_t oldspace, newspace;

		/*
		 * Calculate amount of space we're giving
		 * back as old block size minus new block size.
		 */
		oldspace = blksize(fs, ip, lastblock);
		UFS_SET_ISIZE(length, ip);
		newspace = blksize(fs, ip, lastblock);
		if (newspace == 0) {
			err = ufs_fault(ITOV(ip), "ufs_itrunc: newspace == 0");
			return (err);
		}
		if (oldspace - newspace > 0) {
			/*
			 * Block number of space to be free'd is
			 * the old block # plus the number of frags
			 * required for the storage we're keeping.
			 */
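			/*
			 * Added worked example (illustrative numbers):
			 * with 1 KB frags, if the old last block held
			 * 8 frags and the new length needs only 3, then
			 * numfrags(fs, newspace) == 3 and the 5 trailing
			 * frags starting at bn + 3 are freed below.
			 */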
			bn += numfrags(fs, newspace);
			free(ip, bn, oldspace - newspace, flags);
			blocksreleased += btodb(oldspace - newspace);
		}
	}
done:
	/* BEGIN PARANOIA */
	for (level = SINGLE; level <= TRIPLE; level++)
		if (ip->i_ib[level] != oip->i_ib[level]) {
			err = ufs_fault(ITOV(ip), "ufs_itrunc: indirect block");
			return (err);
		}

	for (i = 0; i < NDADDR; i++)
		if (ip->i_db[i] != oip->i_db[i]) {
			err = ufs_fault(ITOV(ip), "ufs_itrunc: direct block");
			return (err);
		}
	/* END PARANOIA */
	oip->i_blocks -= blocksreleased;

	if (oip->i_blocks < 0) {	/* sanity */
		cmn_err(CE_NOTE,
		    "ufs_itrunc: %s/%d new size = %lld, blocks = %d\n",
		    fs->fs_fsmnt, (int)oip->i_number, oip->i_size,
		    (int)oip->i_blocks);
		oip->i_blocks = 0;
	}
	oip->i_flag |= ICHG|IATTCHG;
	oip->i_seq++;
	/* blocksreleased is >= zero, so this cannot fail */
	(void) chkdq(oip, -blocksreleased, 0, cr, (char **)NULL,
	    (size_t *)NULL);
	return (0);
}

/*
 * Check mode permission on inode.  Mode is READ, WRITE or EXEC.
 * In the case of WRITE, the read-only status of the file system
 * is checked.  Depending on the calling user, the appropriate
 * mode bits are selected; privileges to override missing permission
 * bits are checked through secpolicy_vnode_access().
 * The i_contents lock must be held as reader here to prevent racing with
 * the acl subsystem removing/setting/changing acls on this inode.
 * The caller is responsible for indicating whether or not the i_contents
 * lock needs to be acquired here or if already held.
 */
int
ufs_iaccess(struct inode *ip, int mode, struct cred *cr, int dolock)
{
	int shift = 0;
	int ret = 0;

	if (dolock)
		rw_enter(&ip->i_contents, RW_READER);
	ASSERT(RW_LOCK_HELD(&ip->i_contents));

	if (mode & IWRITE) {
		/*
		 * Disallow write attempts on read-only
		 * file systems, unless the file is a block
		 * or character device or a FIFO.
		 */
		if (ip->i_fs->fs_ronly != 0) {
			if ((ip->i_mode & IFMT) != IFCHR &&
			    (ip->i_mode & IFMT) != IFBLK &&
			    (ip->i_mode & IFMT) != IFIFO) {
				ret = EROFS;
				goto out;
			}
		}
	}
	/*
	 * If there is an acl, check the acl and return.
	 */
	if (ip->i_ufs_acl && ip->i_ufs_acl->aowner) {
		ret = ufs_acl_access(ip, mode, cr);
		goto out;
	}

	/*
	 * Access check is based on only one of owner, group, public.
	 * If not owner, then check group.
	 * If not a member of the group, then check public access.
	 */
	if (crgetuid(cr) != ip->i_uid) {
		shift += 3;
		if (!groupmember((uid_t)ip->i_gid, cr))
			shift += 3;
	}

	/* test missing privilege bits */
	ret = secpolicy_vnode_access2(cr, ITOV(ip), ip->i_uid,
	    ip->i_mode << shift, mode);
out:
	if (dolock)
		rw_exit(&ip->i_contents);
	return (ret);
}

/*
 * if necessary, remove an inode from the free list
 *	i_contents is held except at unmount
 *
 * Return 1 if the inode is taken off of the ufs_idle_q,
 * and the caller is expected to call VN_RELE.
 *
 * Return 0 otherwise.
 */
int
ufs_rmidle(struct inode *ip)
{
	int rval = 0;

	mutex_enter(&ip->i_tlock);
	if ((ip->i_flag & IREF) == 0) {
		mutex_enter(&ufs_idle_q.uq_mutex);
		ip->i_freef->i_freeb = ip->i_freeb;
		ip->i_freeb->i_freef = ip->i_freef;
		ip->i_freef = ip;
		ip->i_freeb = ip;
		ip->i_flag |= IREF;
		ufs_idle_q.uq_ne--;
		if (ip->i_flag & IJUNKIQ) {
			ufs_njunk_iq--;
			ip->i_flag &= ~IJUNKIQ;
		} else {
			ufs_nuseful_iq--;
		}
		mutex_exit(&ufs_idle_q.uq_mutex);
		rval = 1;
	}
	mutex_exit(&ip->i_tlock);
	return (rval);
}
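/*
 * Added note: callers follow the contract above; e.g. the lookup path
 * in ufs_iget_internal() does, in effect:
 *
 *	if ((ip->i_flag & IREF) == 0)
 *		if (ufs_rmidle(ip))
 *			VN_RELE(vp);
 *
 * releasing the hold that the idle queue had on the vnode.
 */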
/*
 * scan the hash of inodes and call func with the inode locked
 */
int
ufs_scan_inodes(int rwtry, int (*func)(struct inode *, void *), void *arg,
	struct ufsvfs *ufsvfsp)
{
	struct inode *ip;		/* current inode */
	struct inode *lip = NULL;	/* last/previous inode */
	union ihead *ih;		/* current hash chain */
	int error, i;
	int saverror = 0;
	int lip_held;			/* lip needs a VN_RELE() */

	/*
	 * If ufsvfsp is NULL, then our caller should be holding
	 * ufs_scan_lock to avoid conflicts between ufs_unmount() and
	 * ufs_update().  Otherwise, to avoid false-positives in
	 * ufs_unmount()'s v_count-based EBUSY check, we only hold
	 * those inodes that are in the file system our caller cares
	 * about.
	 *
	 * We know that ip is a valid inode in the hash chain (and thus
	 * we can trust i_ufsvfs) because the inode we chained from
	 * (lip) is still in the hash chain.  This is true because either:
	 *
	 * 1. We did not drop the hash chain lock since the last
	 * iteration (because we were not interested in the last inode),
	 * or
	 * 2. We maintained a hold on the last inode while we
	 * were processing it, so it could not be removed
	 * from the hash chain.
	 *
	 * The whole reason we're dropping and re-grabbing the chain
	 * lock on every inode is so that we don't present a major
	 * choke point on throughput, particularly when we've been
	 * called on behalf of fsflush.
	 */

	for (i = 0, ih = ihead; i < inohsz; i++, ih++) {
		mutex_enter(&ih_lock[i]);
		for (ip = ih->ih_chain[0], lip_held = 0;
		    ip != (struct inode *)ih;
		    ip = lip->i_forw) {

			ins.in_scan.value.ul++;

			/*
			 * Undo the previous iteration's VN_HOLD(), but
			 * only if one was done.
			 */
			if (lip_held)
				VN_RELE(ITOV(lip));

			lip = ip;
			if (ufsvfsp != NULL && ip->i_ufsvfs != ufsvfsp) {
				/*
				 * We're not processing all inodes, and
				 * this inode is not in the filesystem of
				 * interest, so skip it.  No need to do a
				 * VN_HOLD() since we're not dropping the
				 * hash chain lock until after we've
				 * done the i_forw traversal above.
				 */
				lip_held = 0;
				continue;
			}
			VN_HOLD(ITOV(ip));
			lip_held = 1;
			mutex_exit(&ih_lock[i]);

			/*
			 * Acquire the contents lock as writer to make
			 * sure that the inode has been initialized in
			 * the cache or removed from the idle list by
			 * ufs_iget().  This works because ufs_iget()
			 * acquires the contents lock before putting
			 * the inode into the cache.  If we can lock
			 * it, then ufs_iget() is done with it.
			 */
			if (rwtry) {
				if (!rw_tryenter(&ip->i_contents, RW_WRITER)) {
					mutex_enter(&ih_lock[i]);
					continue;
				}
			} else {
				rw_enter(&ip->i_contents, RW_WRITER);
			}

			rw_exit(&ip->i_contents);

			/*
			 * ISTALE means the inode couldn't be read
			 *
			 * We don't have to hold the i_contents lock
			 * for this check for a couple of
			 * reasons.  First, if ISTALE is set then the
			 * flag cannot be cleared until the inode is
			 * removed from the cache and that cannot
			 * happen until after we VN_RELE() it.
			 * Second, if ISTALE is not set, then the
			 * inode is in the cache and does not need to
			 * be read from disk so ISTALE cannot be set
			 * while we are not looking.
			 */
			if ((ip->i_flag & ISTALE) == 0) {
				if ((error = (*func)(ip, arg)) != 0)
					saverror = error;
			}

			mutex_enter(&ih_lock[i]);
		}
		if (lip_held)
			VN_RELE(ITOV(lip));
		mutex_exit(&ih_lock[i]);
	}
	return (saverror);
}

/*
 * Mark inode with the current time, plus a unique increment.
 *
 * Since we only keep 32-bit time on disk, if UFS is still alive
 * beyond 2038, filesystem times will simply stick at the last
 * possible second of 32-bit time.  Not ideal, but probably better
 * than going into the remote past, or confusing applications with
 * negative time.
 */
void
ufs_imark(struct inode *ip)
{
	timestruc_t now;
	int32_t usec, nsec;

	/*
	 * The update of i_seq may have been deferred, increase i_seq here
	 * to make sure it is in sync with the timestamps.
	 */
	if (ip->i_flag & ISEQ) {
		ASSERT(ip->i_flag & (IUPD|ICHG));
		ip->i_seq++;
		ip->i_flag &= ~ISEQ;
	}

	gethrestime(&now);

	/*
	 * Fast algorithm to convert nsec to usec -- see hrt2ts()
	 * in common/os/timers.c for a full description.
	 */
	nsec = now.tv_nsec;
	usec = nsec + (nsec >> 2);
	usec = nsec + (usec >> 1);
	usec = nsec + (usec >> 2);
	usec = nsec + (usec >> 4);
	usec = nsec - (usec >> 3);
	usec = nsec + (usec >> 2);
	usec = nsec + (usec >> 3);
	usec = nsec + (usec >> 4);
	usec = nsec + (usec >> 1);
	usec = nsec + (usec >> 6);
	usec = usec >> 10;

	mutex_enter(&ufs_iuniqtime_lock);
	if (now.tv_sec > (time_t)iuniqtime.tv_sec ||
	    usec > iuniqtime.tv_usec) {
		if (now.tv_sec < TIME32_MAX) {
			iuniqtime.tv_sec = (time32_t)now.tv_sec;
			iuniqtime.tv_usec = usec;
		}
	} else {
		if (iuniqtime.tv_sec < TIME32_MAX) {
			iuniqtime.tv_usec++;
			/* Check for usec overflow */
			if (iuniqtime.tv_usec >= MICROSEC) {
				iuniqtime.tv_sec++;
				iuniqtime.tv_usec = 0;
			}
		}
	}

	if ((ip->i_flag & IACC) && !(ip->i_ufsvfs->vfs_noatime)) {
		ip->i_atime = iuniqtime;
	}
	if (ip->i_flag & IUPD) {
		ip->i_mtime = iuniqtime;
		ip->i_flag |= IMODTIME;
	}
	if (ip->i_flag & ICHG) {
		ip->i_diroff = 0;
		ip->i_ctime = iuniqtime;
	}
	mutex_exit(&ufs_iuniqtime_lock);
}
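/*
 * Added note: the ITIMES_NOLOCK() calls made earlier in this file
 * (e.g. from ufs_iupdat() and ufs_itrunc()) presumably resolve to
 * ufs_itimes_nolock() below via a macro in ufs_inode.h; the "nolock"
 * in the name means the caller is responsible for any serialization
 * of i_flag updates, such as holding i_tlock.
 */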
/*
 * Update timestamps in inode.
 */
void
ufs_itimes_nolock(struct inode *ip)
{

	/*
	 * if noatime is set and the inode access time is the only field that
	 * must be changed, exit immediately.
	 */
	if (((ip->i_flag & (IUPD|IACC|ICHG)) == IACC) &&
	    (ip->i_ufsvfs->vfs_noatime)) {
		return;
	}

	if (ip->i_flag & (IUPD|IACC|ICHG)) {
		if (ip->i_flag & ICHG)
			ip->i_flag |= IMOD;
		else
			ip->i_flag |= IMODACC;
		ufs_imark(ip);
		ip->i_flag &= ~(IACC|IUPD|ICHG);
	}
}