/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */


#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/uio.h>
#include <sys/bitmap.h>
#include <sys/signal.h>
#include <sys/cred.h>
#include <sys/user.h>
#include <sys/vfs.h>
#include <sys/stat.h>
#include <sys/vnode.h>
#include <sys/buf.h>
#include <sys/proc.h>
#include <sys/disp.h>
#include <sys/dnlc.h>
#include <sys/mode.h>
#include <sys/cmn_err.h>
#include <sys/kstat.h>
#include <sys/acl.h>
#include <sys/var.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_fs.h>
#include <sys/fs/ufs_trans.h>
#include <sys/fs/ufs_acl.h>
#include <sys/fs/ufs_bio.h>
#include <sys/fs/ufs_quota.h>
#include <sys/fs/ufs_log.h>
#include <vm/hat.h>
#include <vm/as.h>
#include <vm/pvn.h>
#include <vm/seg.h>
#include <sys/swap.h>
#include <sys/cpuvar.h>
#include <sys/sysmacros.h>
#include <sys/errno.h>
#include <sys/kmem.h>
#include <sys/debug.h>
#include <fs/fs_subr.h>
#include <sys/policy.h>

struct kmem_cache *inode_cache;		/* cache of free inodes */

/* UFS Inode Cache Stats -- Not protected */
struct instats ins = {
    { "size",		KSTAT_DATA_ULONG },
    { "maxsize",	KSTAT_DATA_ULONG },
    { "hits",		KSTAT_DATA_ULONG },
    { "misses",		KSTAT_DATA_ULONG },
    { "kmem allocs",	KSTAT_DATA_ULONG },
    { "kmem frees",	KSTAT_DATA_ULONG },
    { "maxsize reached",	KSTAT_DATA_ULONG },
    { "puts at frontlist",	KSTAT_DATA_ULONG },
    { "puts at backlist",	KSTAT_DATA_ULONG },
    { "queues to free",	KSTAT_DATA_ULONG },
    { "scans",		KSTAT_DATA_ULONG },
    { "thread idles",	KSTAT_DATA_ULONG },
    { "lookup idles",	KSTAT_DATA_ULONG },
    { "vget idles",	KSTAT_DATA_ULONG },
    { "cache allocs",	KSTAT_DATA_ULONG },
    { "cache frees",	KSTAT_DATA_ULONG },
    { "pushes at close",	KSTAT_DATA_ULONG }
};

/* kstat data */
static kstat_t *ufs_inode_kstat = NULL;

union ihead *ihead;	/* inode LRU cache, Chris Maltby */
kmutex_t *ih_lock;	/* protect inode cache hash table */
static int ino_hashlen = 4;	/* desired average hash chain length */
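
/*
 * Sizing sketch (added commentary, not from the original source): ihinit()
 * below computes the bucket count as
 *	inohsz = 1 << highbit(ufs_ninode / ino_hashlen);
 * With highbit() counting bits from 1, 1 << highbit(n) rounds n up to the
 * next power of two, e.g. ufs_ninode = 120000 with the default ino_hashlen
 * of 4 gives 30000, which rounds up to 32768 buckets, keeping average
 * chains near ino_hashlen entries.
 */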
int inohsz;	/* number of buckets in the hash table */

kmutex_t ufs_scan_lock;		/* stop racing multiple ufs_scan_inodes() */
kmutex_t ufs_iuniqtime_lock;	/* protect iuniqtime */
kmutex_t ufsvfs_mutex;
struct ufsvfs *oldufsvfslist, *ufsvfslist;

/*
 * time to wait after ufsvfsp->vfs_iotstamp before declaring that no
 * I/Os are going on.
 */
clock_t ufs_iowait;

/*
 * The threads that process idle inodes and free (deleted) inodes
 * have high water marks that are set in ufsinit().
 * These values can be no less than the minimum shown below.
 */
int ufs_idle_max;	/* # of allowable idle inodes */
ulong_t ufs_inode_max;	/* hard limit of allowable idle inodes */
#define	UFS_IDLE_MAX	(16)	/* min # of allowable idle inodes */

/*
 * Tunables for ufs write throttling.
 * These are validated in ufs_iinit() since improper settings
 * can lead to filesystem hangs.
 */
#define	UFS_HW_DEFAULT	(16 * 1024 * 1024)
#define	UFS_LW_DEFAULT	(8 * 1024 * 1024)
int	ufs_HW = UFS_HW_DEFAULT;
int	ufs_LW = UFS_LW_DEFAULT;

static void ihinit(void);
extern int hash2ints(int, int);

static int ufs_iget_internal(struct vfs *, ino_t, struct inode **,
    struct cred *, int);

/* ARGSUSED */
static int
ufs_inode_kstat_update(kstat_t *ksp, int rw)
{
    if (rw == KSTAT_WRITE)
        return (EACCES);

    ins.in_malloc.value.ul	= (ulong_t)kmem_cache_stat(inode_cache,
        "slab_alloc");
    ins.in_mfree.value.ul	= (ulong_t)kmem_cache_stat(inode_cache,
        "slab_free");
    ins.in_kcalloc.value.ul	= (ulong_t)kmem_cache_stat(inode_cache,
        "alloc");
    ins.in_kcfree.value.ul	= (ulong_t)kmem_cache_stat(inode_cache,
        "free");
    ins.in_size.value.ul	= (ulong_t)kmem_cache_stat(inode_cache,
        "buf_inuse");
    ins.in_maxreached.value.ul	= (ulong_t)kmem_cache_stat(inode_cache,
        "buf_max");
    ins.in_misses.value.ul = ins.in_kcalloc.value.ul;

    return (0);
}

void
ufs_iinit(void)
{
    /*
     * Validate that ufs_HW > ufs_LW.
     * The default values for these two tunables have been increased.
     * There is now a range of values for ufs_HW that used to be
     * legal on previous Solaris versions but no longer is now.
     * Upgrading a machine which has an /etc/system setting for ufs_HW
     * from that range can lead to filesystem hangs unless the values
     * are checked here.
     */
    if (ufs_HW <= ufs_LW) {
        cmn_err(CE_WARN,
            "ufs_HW (%d) <= ufs_LW (%d). Check /etc/system.",
            ufs_HW, ufs_LW);
        ufs_LW = UFS_LW_DEFAULT;
        ufs_HW = UFS_HW_DEFAULT;
        cmn_err(CE_CONT, "using defaults, ufs_HW = %d, ufs_LW = %d\n",
            ufs_HW, ufs_LW);
    }

    /*
     * Adjust the tunable `ufs_ninode' to a reasonable value
     */
    if (ufs_ninode <= 0)
        ufs_ninode = ncsize;
    if (ufs_inode_max == 0)
        ufs_inode_max =
            (ulong_t)((kmem_maxavail() >> 2) / sizeof (struct inode));
    if (ufs_ninode > ufs_inode_max || (ufs_ninode == 0 && ncsize == 0)) {
        cmn_err(CE_NOTE, "setting ufs_ninode to max value of %ld",
            ufs_inode_max);
        ufs_ninode = ufs_inode_max;
    }
    /*
     * Wait till third call of ufs_update to declare that no I/Os are
     * going on.  This allows deferred access times to be flushed to disk.
     */
    ufs_iowait = v.v_autoup * hz * 2;
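
    /*
     * Illustrative note (added commentary): v_autoup is the fsflush
     * auto-update interval in seconds (commonly 30 by default, an
     * assumption about the usual tuning, not text from this file), so two
     * intervals' worth of clock ticks means ufs_update() has run at least
     * twice -- hence "third call" -- before I/O is declared idle.  With
     * autoup = 30 and hz = 100 this is 6000 ticks, i.e. 60 seconds.
     */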

    /*
     * idle thread runs when 25% of ufs_ninode entries are on the queue
     */
    if (ufs_idle_max == 0)
        ufs_idle_max = ufs_ninode >> 2;
    if (ufs_idle_max < UFS_IDLE_MAX)
        ufs_idle_max = UFS_IDLE_MAX;
    if (ufs_idle_max > ufs_ninode)
        ufs_idle_max = ufs_ninode;
    /*
     * This is really a misnomer, it is ufs_queue_init
     */
    ufs_thread_init(&ufs_idle_q, ufs_idle_max);
    ufs_thread_start(&ufs_idle_q, ufs_thread_idle, NULL);

    /*
     * global hlock thread
     */
    ufs_thread_init(&ufs_hlock, 1);
    ufs_thread_start(&ufs_hlock, ufs_thread_hlock, NULL);

    ihinit();
    qtinit();
    ins.in_maxsize.value.ul = ufs_ninode;
    if ((ufs_inode_kstat = kstat_create("ufs", 0, "inode_cache", "ufs",
        KSTAT_TYPE_NAMED, sizeof (ins) / sizeof (kstat_named_t),
        KSTAT_FLAG_VIRTUAL)) != NULL) {
        ufs_inode_kstat->ks_data = (void *)&ins;
        ufs_inode_kstat->ks_update = ufs_inode_kstat_update;
        kstat_install(ufs_inode_kstat);
    }
    ufsfx_init();		/* fix-on-panic initialization */
    si_cache_init();
    ufs_directio_init();
    lufs_init();
    mutex_init(&ufs_iuniqtime_lock, NULL, MUTEX_DEFAULT, NULL);
}

/* ARGSUSED */
static int
ufs_inode_cache_constructor(void *buf, void *cdrarg, int kmflags)
{
    struct inode *ip = buf;
    struct vnode *vp;

    vp = ip->i_vnode = vn_alloc(kmflags);
    if (vp == NULL) {
        return (-1);
    }
    vn_setops(vp, ufs_vnodeops);
    vp->v_data = ip;

    rw_init(&ip->i_rwlock, NULL, RW_DEFAULT, NULL);
    rw_init(&ip->i_contents, NULL, RW_DEFAULT, NULL);
    mutex_init(&ip->i_tlock, NULL, MUTEX_DEFAULT, NULL);
    dnlc_dir_init(&ip->i_danchor);

    cv_init(&ip->i_wrcv, NULL, CV_DRIVER, NULL);

    return (0);
}

/* ARGSUSED */
static void
ufs_inode_cache_destructor(void *buf, void *cdrarg)
{
    struct inode *ip = buf;
    struct vnode *vp;

    vp = ITOV(ip);

    rw_destroy(&ip->i_rwlock);
    rw_destroy(&ip->i_contents);
    mutex_destroy(&ip->i_tlock);
    if (vp->v_type == VDIR) {
        dnlc_dir_fini(&ip->i_danchor);
    }

    cv_destroy(&ip->i_wrcv);

    vn_free(vp);
}
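
/*
 * Note on the constructor/destructor pair above (added commentary; an
 * assumption about kmem cache behavior, not text from the original source):
 * kmem cache constructors and destructors run when buffers enter and leave
 * the cache, not on every kmem_cache_alloc()/kmem_cache_free().  The vnode,
 * locks, condition variable, and dnlc anchor set up in the constructor
 * therefore survive across alloc/free cycles of the same buffer, which is
 * why ufs_alloc_inode() below only reinitializes the per-instance fields.
 */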

/*
 * Initialize hash links for inodes
 * and build inode free list.
 */
void
ihinit(void)
{
    int i;
    union ihead *ih = ihead;

    mutex_init(&ufs_scan_lock, NULL, MUTEX_DEFAULT, NULL);

    inohsz = 1 << highbit(ufs_ninode / ino_hashlen);
    ihead = kmem_zalloc(inohsz * sizeof (union ihead), KM_SLEEP);
    ih_lock = kmem_zalloc(inohsz * sizeof (kmutex_t), KM_SLEEP);

    for (i = 0, ih = ihead; i < inohsz; i++, ih++) {
        ih->ih_head[0] = ih;
        ih->ih_head[1] = ih;
        mutex_init(&ih_lock[i], NULL, MUTEX_DEFAULT, NULL);
    }
    inode_cache = kmem_cache_create("ufs_inode_cache",
        sizeof (struct inode), 0, ufs_inode_cache_constructor,
        ufs_inode_cache_destructor, ufs_inode_cache_reclaim,
        NULL, NULL, 0);
}

/*
 * Free an inode structure
 */
void
ufs_free_inode(struct inode *ip)
{
    vn_invalid(ITOV(ip));
    kmem_cache_free(inode_cache, ip);
}

/*
 * Allocate an inode structure
 */
struct inode *
ufs_alloc_inode(ufsvfs_t *ufsvfsp, ino_t ino)
{
    struct inode *ip;
    vnode_t *vp;

    ip = kmem_cache_alloc(inode_cache, KM_SLEEP);
    /*
     * at this point we have a newly allocated inode
     */
    ip->i_freef = ip;
    ip->i_freeb = ip;
    ip->i_flag = IREF;
    ip->i_seq = 0xFF;	/* Unique initial value */
    ip->i_dev = ufsvfsp->vfs_dev;
    ip->i_ufsvfs = ufsvfsp;
    ip->i_devvp = ufsvfsp->vfs_devvp;
    ip->i_number = ino;
    ip->i_diroff = 0;
    ip->i_nextr = 0;
    ip->i_map = NULL;
    ip->i_rdev = 0;
    ip->i_writes = 0;
    ip->i_mode = 0;
    ip->i_delaylen = 0;
    ip->i_delayoff = 0;
    ip->i_nextrio = 0;
    ip->i_ufs_acl = NULL;
    ip->i_cflags = 0;
    ip->i_mapcnt = 0;
    ip->i_dquot = NULL;
    ip->i_cachedir = CD_ENABLED;
    ip->i_writer = NULL;

    /*
     * the vnode for this inode was allocated by the constructor
     */
    vp = ITOV(ip);
    vn_reinit(vp);
    if (ino == (ino_t)UFSROOTINO)
        vp->v_flag = VROOT;
    vp->v_vfsp = ufsvfsp->vfs_vfs;
    vn_exists(vp);
    return (ip);
}

/*
 * Look up an inode by device, inumber.  If it is in core (in the
 * inode structure), honor the locking protocol.  If it is not in
 * core, read it in from the specified device after freeing any pages.
 * In all cases, a pointer to a VN_HELD inode structure is returned.
 */
int
ufs_iget(struct vfs *vfsp, ino_t ino, struct inode **ipp, struct cred *cr)
{
    return (ufs_iget_internal(vfsp, ino, ipp, cr, 0));
}

/*
 * A version of ufs_iget which returns only allocated, linked inodes.
 * This is appropriate for any callers who do not expect a free inode.
 */
int
ufs_iget_alloced(struct vfs *vfsp, ino_t ino, struct inode **ipp,
    struct cred *cr)
{
    return (ufs_iget_internal(vfsp, ino, ipp, cr, 1));
}

/*
 * Set vnode attributes based on v_type; this should be called whenever
 * an inode's i_mode is changed.
 */
void
ufs_reset_vnode(vnode_t *vp)
{
    /*
     * an old DBE hack
     */
    if ((VTOI(vp)->i_mode & (ISVTX | IEXEC | IFDIR)) == ISVTX)
        vp->v_flag |= VSWAPLIKE;
    else
        vp->v_flag &= ~VSWAPLIKE;

    /*
     * if not swap like and it's just a regular file, we want
     * to maintain the vnode's pages sorted by clean/modified
     * for faster sync'ing to disk
     */
    if (vp->v_type == VREG)
        vp->v_flag |= VMODSORT;
    else
        vp->v_flag &= ~VMODSORT;

    /*
     * Is this an attribute hidden dir?
     */
    if ((VTOI(vp)->i_mode & IFMT) == IFATTRDIR)
        vp->v_flag |= V_XATTRDIR;
    else
        vp->v_flag &= ~V_XATTRDIR;
}
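
/*
 * Illustrative example (added commentary, not from the original source):
 * the VSWAPLIKE test above fires only when ISVTX is set while both IEXEC
 * and IFDIR are clear.  A plain file created with mode 01644 (sticky bit,
 * rw-r--r--) would be marked VSWAPLIKE; a sticky directory (mode 01777)
 * would not, because the IFDIR bit is part of the masked value being
 * compared against ISVTX.
 */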

/*
 * Shared implementation of ufs_iget and ufs_iget_alloced.  The 'validate'
 * flag is used to distinguish the two; when true, we validate that the inode
 * being retrieved looks like a linked and allocated inode.
 */
/* ARGSUSED */
static int
ufs_iget_internal(struct vfs *vfsp, ino_t ino, struct inode **ipp,
    struct cred *cr, int validate)
{
    struct inode *ip, *sp;
    union ihead *ih;
    kmutex_t *ihm;
    struct buf *bp;
    struct dinode *dp;
    struct vnode *vp;
    extern vfs_t EIO_vfs;
    int error;
    int ftype;	/* XXX - Remove later on */
    dev_t vfs_dev;
    struct ufsvfs *ufsvfsp;
    struct fs *fs;
    int hno;
    daddr_t bno;
    ulong_t ioff;

    CPU_STATS_ADD_K(sys, ufsiget, 1);

    /*
     * Lookup inode in cache.
     */
    vfs_dev = vfsp->vfs_dev;
    hno = INOHASH(ino);
    ih = &ihead[hno];
    ihm = &ih_lock[hno];

again:
    mutex_enter(ihm);
    for (ip = ih->ih_chain[0]; ip != (struct inode *)ih; ip = ip->i_forw) {
        if (ino != ip->i_number || vfs_dev != ip->i_dev ||
            (ip->i_flag & ISTALE))
            continue;

        /*
         * Found the interesting inode; hold it and drop the cache lock
         */
        vp = ITOV(ip);	/* for locknest */
        VN_HOLD(vp);
        mutex_exit(ihm);
        rw_enter(&ip->i_contents, RW_READER);

        /*
         * if necessary, remove from idle list
         */
        if ((ip->i_flag & IREF) == 0) {
            if (ufs_rmidle(ip))
                VN_RELE(vp);
        }

        /*
         * Could the inode be read from disk?
         */
        if (ip->i_flag & ISTALE) {
            rw_exit(&ip->i_contents);
            VN_RELE(vp);
            goto again;
        }

        ins.in_hits.value.ul++;
        *ipp = ip;

        /*
         * Reset the vnode's attribute flags
         */
        mutex_enter(&vp->v_lock);
        ufs_reset_vnode(vp);
        mutex_exit(&vp->v_lock);

        rw_exit(&ip->i_contents);

        return (0);
    }
    mutex_exit(ihm);

    /*
     * Inode was not in cache.
     *
     * Allocate a new entry
     */
    ufsvfsp = (struct ufsvfs *)vfsp->vfs_data;
    fs = ufsvfsp->vfs_fs;

    ip = ufs_alloc_inode(ufsvfsp, ino);
    vp = ITOV(ip);

    bno = fsbtodb(fs, itod(fs, ino));
    ioff = (sizeof (struct dinode)) * (itoo(fs, ino));
    ip->i_doff = (offset_t)ioff + ldbtob(bno);

    /*
     * put a place holder in the cache (if not already there)
     */
    mutex_enter(ihm);
    for (sp = ih->ih_chain[0]; sp != (struct inode *)ih; sp = sp->i_forw)
        if (ino == sp->i_number && vfs_dev == sp->i_dev &&
            ((sp->i_flag & ISTALE) == 0)) {
            mutex_exit(ihm);
            ufs_free_inode(ip);
            goto again;
        }
    /*
     * It would be nice to ASSERT(RW_READ_HELD(&ufsvfsp->vfs_dqrwlock))
     * here, but if we do, then shadow inode allocations panic the
     * system.  We don't have to hold vfs_dqrwlock for shadow inodes
     * and the ufs_iget() parameters don't tell us what we are getting
     * so we have no way of knowing this is a ufs_iget() call from
     * a ufs_ialloc() call for a shadow inode.
     */
    rw_enter(&ip->i_contents, RW_WRITER);
    insque(ip, ih);
    mutex_exit(ihm);
    /*
     * read the dinode
     */
    bp = UFS_BREAD(ufsvfsp, ip->i_dev, bno, (int)fs->fs_bsize);

    /*
     * Check I/O errors
     */
    error = ((bp->b_flags & B_ERROR) ? geterror(bp) : 0);
    if (error) {
        brelse(bp);
        ip->i_flag |= ISTALE;	/* in case someone is looking it up */
        rw_exit(&ip->i_contents);
        vp->v_vfsp = &EIO_vfs;
        VN_RELE(vp);
        return (error);
    }
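
    /*
     * Worked example (added for illustration; the figures are typical UFS
     * geometry, not taken from this file): an on-disk inode (struct dinode)
     * is 128 bytes, so an 8 KB file system block holds 64 of them.  itod()
     * maps the inode number to the file system block containing it, itoo()
     * gives the inode's slot within that block, and so ioff computed above
     * is the byte offset of the dinode inside the buffer just read.
     */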
    /*
     * initialize the inode's dinode
     */
    dp = (struct dinode *)(ioff + bp->b_un.b_addr);
    ip->i_ic = dp->di_ic;	/* structure assignment */
    brelse(bp);

    /*
     * Maintain compatibility with Solaris 1.x UFS
     */
    if (ip->i_suid != UID_LONG)
        ip->i_uid = ip->i_suid;
    if (ip->i_sgid != GID_LONG)
        ip->i_gid = ip->i_sgid;

    ftype = ip->i_mode & IFMT;
    if (ftype == IFBLK || ftype == IFCHR) {
        dev_t dv;
        uint_t top16 = ip->i_ordev & 0xffff0000u;

        if (top16 == 0 || top16 == 0xffff0000u)
            dv = expdev(ip->i_ordev);
        else
            dv = expldev(ip->i_ordev);
        vp->v_rdev = ip->i_rdev = dv;
    }

    /*
     * if our caller only expects allocated inodes, verify that
     * this inode looks good; throw it out if it's bad.
     */
    if (validate) {
        if ((ftype == 0) || (ip->i_nlink <= 0)) {
            ip->i_flag |= ISTALE;
            rw_exit(&ip->i_contents);
            vp->v_vfsp = &EIO_vfs;
            VN_RELE(vp);
            cmn_err(CE_NOTE,
                "%s: unexpected free inode %d, run fsck(1M)%s",
                fs->fs_fsmnt, (int)ino,
                (TRANS_ISTRANS(ufsvfsp) ? " -o f" : ""));
            return (EIO);
        }
    }

    /*
     * Finish initializing the vnode.  Special handling is needed for
     * shadow inodes because IFTOVT() would produce a v_type of VNON,
     * which is not what we want; set v_type to VREG explicitly in
     * that case.
     */
    if (ftype == IFSHAD) {
        vp->v_type = VREG;
    } else {
        vp->v_type = IFTOVT((mode_t)ip->i_mode);
    }

    ufs_reset_vnode(vp);

    /*
     * read the shadow
     */
    if (ftype != 0 && ip->i_shadow != 0) {
        if ((error = ufs_si_load(ip, cr)) != 0) {
            ip->i_flag |= ISTALE;
            ip->i_ufs_acl = NULL;
            rw_exit(&ip->i_contents);
            vp->v_vfsp = &EIO_vfs;
            VN_RELE(vp);
            return (error);
        }
    }

    /*
     * Only attach quota information if the inode has a type and if
     * that type is not a shadow inode.
     */
    if (ip->i_mode && ((ip->i_mode & IFMT) != IFSHAD) &&
        ((ip->i_mode & IFMT) != IFATTRDIR)) {
        ip->i_dquot = getinoquota(ip);
    }
    TRANS_MATA_IGET(ufsvfsp, ip);
    *ipp = ip;
    rw_exit(&ip->i_contents);

    return (0);
}
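
/*
 * Usage note (added commentary, not from the original source): a successful
 * ufs_iget()/ufs_iget_internal() returns with a hold on the vnode.  When
 * the last reference is given up via VN_RELE(), the inactive path hands the
 * inode to ufs_iinactive() below, which either destroys it, queues it to
 * the delete thread, or parks it on an idle queue for later reuse.
 */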

/*
 * Vnode is no longer referenced, write the inode out
 * and if necessary, truncate and deallocate the file.
 */
void
ufs_iinactive(struct inode *ip)
{
    int front;
    struct inode *iq;
    struct inode *hip;
    struct ufs_q *uq;
    struct vnode *vp = ITOV(ip);
    struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
    struct ufs_delq_info *delq_info = &ufsvfsp->vfs_delete_info;

    /*
     * Because the vnode type might have been changed,
     * the dnlc_dir_purge must be called unconditionally.
     */
    dnlc_dir_purge(&ip->i_danchor);

    /*
     * Get exclusive access to inode data.
     */
    rw_enter(&ip->i_contents, RW_WRITER);
    ASSERT(ip->i_flag & IREF);

    /*
     * Make sure no one reclaimed the inode before we put it on
     * the freelist or destroy it.  We keep our 'hold' on the vnode
     * from vn_rele until we are ready to do something with the inode.
     *
     * Pageout may put a VN_HOLD/VN_RELE at anytime during this
     * operation via an async putpage, so we must make sure
     * we don't free/destroy the inode more than once.  ufs_iget
     * may also put a VN_HOLD on the inode before it grabs
     * the i_contents lock.  This is done so we don't free
     * an inode that a thread is waiting on.
     */
    mutex_enter(&vp->v_lock);

    if (vp->v_count > 1) {
        vp->v_count--;	/* release our hold from vn_rele */
        mutex_exit(&vp->v_lock);
        rw_exit(&ip->i_contents);
        return;
    }
    mutex_exit(&vp->v_lock);

    /*
     * For umount case: if ufsvfs ptr is NULL, the inode is unhashed
     * and clean.  It can be safely destroyed (cyf).
     */
    if (ip->i_ufsvfs == NULL) {
        rw_exit(&ip->i_contents);
        ufs_si_del(ip);
        ASSERT((vp->v_type == VCHR) || !vn_has_cached_data(vp));
        ufs_free_inode(ip);
        return;
    }

    /*
     * queue idle inode to appropriate thread. Will check v_count == 1
     * prior to putting this on the appropriate queue.
     * Stale inodes will be unhashed and freed by the ufs idle thread
     * in ufs_idle_free()
     */
    front = 1;
    if ((ip->i_flag & ISTALE) == 0 && ip->i_fs->fs_ronly == 0 &&
        ip->i_mode && ip->i_nlink <= 0) {
        /*
         * Mark the i_flag to indicate that inode is being deleted.
         * This flag will be cleared when the deletion is complete.
         * This prevents nfs from sneaking in via ufs_vget() while
         * the delete is in progress (bugid 1242481).
         */
        ip->i_flag |= IDEL;

        /*
         * NOIDEL means that deletes are not allowed at this time;
         * whoever resets NOIDEL will also send this inode back
         * through ufs_iinactive.  IREF remains set.
         */
        if (ULOCKFS_IS_NOIDEL(ITOUL(ip))) {
            mutex_enter(&vp->v_lock);
            vp->v_count--;
            mutex_exit(&vp->v_lock);
            rw_exit(&ip->i_contents);
            return;
        }
        if (!TRANS_ISTRANS(ip->i_ufsvfs)) {
            rw_exit(&ip->i_contents);
            ufs_delete(ip->i_ufsvfs, ip, 0);
            return;
        }

        /* queue to delete thread; IREF remains set */
        ins.in_qfree.value.ul++;
        uq = &ip->i_ufsvfs->vfs_delete;

        mutex_enter(&uq->uq_mutex);

        /* add to q */
        if ((iq = uq->uq_ihead) != 0) {
            ip->i_freef = iq;
            ip->i_freeb = iq->i_freeb;
            iq->i_freeb->i_freef = ip;
            iq->i_freeb = ip;
            if (front)
                uq->uq_ihead = ip;
        } else {
            uq->uq_ihead = ip;
            ip->i_freef = ip;
            ip->i_freeb = ip;
        }

        delq_info->delq_unreclaimed_files += 1;
        delq_info->delq_unreclaimed_blocks += ip->i_blocks;
    } else {
        /*
         * queue to idle thread
         * Check the v_count == 1 again.
         */
        mutex_enter(&vp->v_lock);
        if (vp->v_count > 1) {
            vp->v_count--;	/* release our hold from vn_rele */
            mutex_exit(&vp->v_lock);
            rw_exit(&ip->i_contents);
            return;
        }
        mutex_exit(&vp->v_lock);
        uq = &ufs_idle_q;

        /*
         * useful iff it has pages or is a fastsymlink; otherwise junk
         */
        mutex_enter(&uq->uq_mutex);

        /* clear IREF means `on idle list' */
        ip->i_flag &= ~(IREF | IDIRECTIO);

        if (vn_has_cached_data(vp) || ip->i_flag & IFASTSYMLNK) {
            ins.in_frback.value.ul++;
            hip = (inode_t *)&ufs_useful_iq[IQHASH(ip)];
            ufs_nuseful_iq++;
        } else {
            ins.in_frfront.value.ul++;
            hip = (inode_t *)&ufs_junk_iq[IQHASH(ip)];
            ip->i_flag |= IJUNKIQ;
            ufs_njunk_iq++;
        }
        ip->i_freef = hip;
        ip->i_freeb = hip->i_freeb;
        hip->i_freeb->i_freef = ip;
        hip->i_freeb = ip;
    }

    /* wakeup thread(s) if q is overfull */
    if (++uq->uq_ne == uq->uq_lowat)
        cv_broadcast(&uq->uq_cv);

    /* all done, release the q and inode */
    mutex_exit(&uq->uq_mutex);
    rw_exit(&ip->i_contents);
}

/*
 * Check accessed and update flags on an inode structure.
 * If any are on, update the inode with the (unique) current time.
 * If waitfor is given, ensure I/O ordering by waiting for the write
 * to complete.
 */
void
ufs_iupdat(struct inode *ip, int waitfor)
{
    struct buf *bp;
    struct fs *fp;
    struct dinode *dp;
    struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
    int i;
    int do_trans_times;
    ushort_t flag;
    o_uid_t suid;
    o_gid_t sgid;

    /*
     * This function is now safe to be called with either the reader
     * or writer i_contents lock.
     */
    ASSERT(RW_LOCK_HELD(&ip->i_contents));

    /*
     * Return if file system has been forcibly umounted.
     */
    if (ufsvfsp == NULL)
        return;

    flag = ip->i_flag;	/* Atomic read */
    /*
     * We better not update the disk inode from a stale inode.
     */
    if (flag & ISTALE)
        return;

    fp = ip->i_fs;

    if ((flag & (IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG)) != 0) {
        if (fp->fs_ronly) {
            mutex_enter(&ip->i_tlock);
            ip->i_flag &= ~(IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG);
            mutex_exit(&ip->i_tlock);
            return;
        }
        /*
         * fs is active while metadata is being written
         */
        mutex_enter(&ufsvfsp->vfs_lock);
        ufs_notclean(ufsvfsp);
        /*
         * get the dinode
         */
        bp = UFS_BREAD(ufsvfsp, ip->i_dev,
            (daddr_t)fsbtodb(fp, itod(fp, ip->i_number)),
            (int)fp->fs_bsize);
        if (bp->b_flags & B_ERROR) {
            mutex_enter(&ip->i_tlock);
            ip->i_flag &=
                ~(IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG);
            mutex_exit(&ip->i_tlock);
            brelse(bp);
            return;
        }
        /*
         * munge inode fields
         */
        mutex_enter(&ip->i_tlock);
        ITIMES_NOLOCK(ip);
        do_trans_times = ((ip->i_flag & (IMOD|IMODACC)) == IMODACC);
        ip->i_flag &= ~(IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG);
        mutex_exit(&ip->i_tlock);

        /*
         * For reads and concurrent re-writes, no deltas were
         * entered for the access time changes - do it now.
         */
        if (do_trans_times) {
            TRANS_INODE_TIMES(ufsvfsp, ip);
        }

        /*
         * For SunOS 5.0->5.4, these lines below read:
         *
         * suid = (ip->i_uid > MAXUID) ? UID_LONG : ip->i_uid;
         * sgid = (ip->i_gid > MAXUID) ? GID_LONG : ip->i_gid;
         *
         * where MAXUID was set to 60002.  This was incorrect -
         * the uids should have been constrained to what fitted into
         * a 16-bit word.
         *
         * This means that files from 4.x filesystems that have an
         * i_suid field larger than 60002 will have that field
         * changed to 65535.
         *
         * Security note: 4.x UFS could never create an i_suid of
         * UID_LONG since that would've corresponded to -1.
         */
        suid = (ulong_t)ip->i_uid > (ulong_t)USHRT_MAX ?
            UID_LONG : ip->i_uid;
        sgid = (ulong_t)ip->i_gid > (ulong_t)USHRT_MAX ?
            GID_LONG : ip->i_gid;

        if ((ip->i_suid != suid) || (ip->i_sgid != sgid)) {
            ip->i_suid = suid;
            ip->i_sgid = sgid;
            TRANS_INODE(ufsvfsp, ip);
        }

        if ((ip->i_mode & IFMT) == IFBLK ||
            (ip->i_mode & IFMT) == IFCHR) {
            dev_t d = ip->i_rdev;
            dev32_t dev32;

            /*
             * load first direct block only if special device
             */
            if (!cmpldev(&dev32, d)) {
                /*
                 * We panic here because there's "no way"
                 * we should have been able to create a large
                 * inode with a large dev_t.  Earlier layers
                 * should've caught this.
                 */
                panic("ip %p: i_rdev too big", (void *)ip);
            }

            if (dev32 & ~((O_MAXMAJ << L_BITSMINOR32) | O_MAXMIN)) {
                ip->i_ordev = dev32;	/* can't use old fmt. */
            } else {
                ip->i_ordev = cmpdev(d);
            }
        }

        /*
         * copy inode to dinode (zero fastsymlnk in dinode)
         */
        dp = (struct dinode *)bp->b_un.b_addr + itoo(fp, ip->i_number);
        dp->di_ic = ip->i_ic;	/* structure assignment */
        if (flag & IFASTSYMLNK) {
            for (i = 1; i < NDADDR; i++)
                dp->di_db[i] = 0;
            for (i = 0; i < NIADDR; i++)
                dp->di_ib[i] = 0;
        }
        if (TRANS_ISTRANS(ufsvfsp)) {
            /*
             * Pass only a sector size buffer containing
             * the inode, otherwise when the buffer is copied
             * into a cached roll buffer then too much memory
             * gets consumed if 8KB inode buffers are passed.
             */
            TRANS_LOG(ufsvfsp, (caddr_t)dp, ip->i_doff,
                sizeof (struct dinode),
                (caddr_t)P2ALIGN((uintptr_t)dp, DEV_BSIZE),
                DEV_BSIZE);

            brelse(bp);
        } else if (waitfor && (ip->i_ufsvfs->vfs_dio == 0)) {
            UFS_BRWRITE(ufsvfsp, bp);

            /*
             * Synchronous write has guaranteed that inode
             * has been written on disk so clear the flag
             */
            mutex_enter(&ip->i_tlock);
            ip->i_flag &= ~IBDWRITE;
            mutex_exit(&ip->i_tlock);
        } else {
            bdrwrite(bp);

            /*
             * This write hasn't guaranteed that inode has been
             * written on the disk.
             * Since all update flags on the inode are cleared, we
             * must remember the condition in case the inode is to
             * be updated synchronously later (e.g., fsync()/
             * fdatasync()) and the inode has not been modified yet.
             */
            mutex_enter(&ip->i_tlock);
            ip->i_flag |= IBDWRITE;
            mutex_exit(&ip->i_tlock);
        }
    } else {
        /*
         * In case previous inode update was done asynchronously
         * (IBDWRITE) and this inode update request wants guaranteed
         * (synchronous) disk update, flush the inode.
         */
        if (waitfor && (flag & IBDWRITE)) {
            blkflush(ip->i_dev,
                (daddr_t)fsbtodb(fp, itod(fp, ip->i_number)));
            mutex_enter(&ip->i_tlock);
            ip->i_flag &= ~IBDWRITE;
            mutex_exit(&ip->i_tlock);
        }
    }
}

#define	SINGLE	0	/* index of single indirect block */
#define	DOUBLE	1	/* index of double indirect block */
#define	TRIPLE	2	/* index of triple indirect block */

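/*
 * Capacity sketch (added commentary; the numbers assume an 8 KB block size
 * and 4-byte disk addresses, i.e. NINDIR(fs) == 2048): the single indirect
 * block maps 2048 data blocks, the double indirect maps 2048 * 2048, and
 * the triple indirect 2048^3.  The lastiblock[] values computed in
 * ufs_itrunc() below index into exactly these three ranges.
 */
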
/*
 * Release blocks associated with the inode ip and
 * stored in the indirect block bn.  Blocks are free'd
 * in LIFO order up to (but not including) lastbn.  If
 * level is greater than SINGLE, the block is an indirect
 * block and recursive calls to indirtrunc must be used to
 * cleanse other indirect blocks.
 *
 * N.B.: triple indirect blocks are untested.
 */
static long
indirtrunc(struct inode *ip, daddr_t bn, daddr_t lastbn, int level, int flags)
{
    int i;
    struct buf *bp, *copy;
    daddr32_t *bap;
    struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
    struct fs *fs = ufsvfsp->vfs_fs;
    daddr_t nb, last;
    long factor;
    int blocksreleased = 0, nblocks;

    ASSERT(RW_WRITE_HELD(&ip->i_contents));
    /*
     * Calculate index in current block of last
     * block to be kept.  -1 indicates the entire
     * block so we need not calculate the index.
     */
    factor = 1;
    for (i = SINGLE; i < level; i++)
        factor *= NINDIR(fs);
    last = lastbn;
    if (lastbn > 0)
        last /= factor;
    nblocks = btodb(fs->fs_bsize);
    /*
     * Get buffer of block pointers, zero those
     * entries corresponding to blocks to be free'd,
     * and update on disk copy first.
     * *Unless* the root pointer has been synchronously
     * written to disk.  If nothing points to this
     * indirect block then don't bother zero'ing and
     * writing it.
     */
    bp = UFS_BREAD(ufsvfsp,
        ip->i_dev, (daddr_t)fsbtodb(fs, bn), (int)fs->fs_bsize);
    if (bp->b_flags & B_ERROR) {
        brelse(bp);
        return (0);
    }
    bap = bp->b_un.b_daddr;
    if ((flags & I_CHEAP) == 0) {
        uint_t zb;

        zb = (uint_t)((NINDIR(fs) - (last + 1)) * sizeof (daddr32_t));

        if (zb) {
            /*
             * push any data into the log before we zero it
             */
            if (bp->b_flags & B_DELWRI)
                TRANS_LOG(ufsvfsp, (caddr_t)bap,
                    ldbtob(bp->b_blkno), bp->b_bcount,
                    bp->b_un.b_addr, bp->b_bcount);
            copy = ngeteblk(fs->fs_bsize);
            bcopy((caddr_t)bap, (caddr_t)copy->b_un.b_daddr,
                (uint_t)fs->fs_bsize);
            bzero((caddr_t)&bap[last + 1], zb);

            TRANS_BUF(ufsvfsp,
                (caddr_t)&bap[last + 1] - (caddr_t)bap,
                zb, bp, DT_ABZERO);

            UFS_BRWRITE(ufsvfsp, bp);
            bp = copy, bap = bp->b_un.b_daddr;
        }
    } else {
        /* make sure write retries are also cleared */
        bp->b_flags &= ~(B_DELWRI | B_RETRYWRI);
        bp->b_flags |= B_STALE | B_AGE;
    }

    /*
     * Recursively free totally unused blocks.
     */
    flags |= I_CHEAP;
    for (i = NINDIR(fs) - 1; i > last; i--) {
        nb = bap[i];
        if (nb == 0)
            continue;
        if (level > SINGLE) {
            blocksreleased +=
                indirtrunc(ip, nb, (daddr_t)-1, level - 1, flags);
            free(ip, nb, (off_t)fs->fs_bsize, flags | I_IBLK);
        } else
            free(ip, nb, (off_t)fs->fs_bsize, flags);
        blocksreleased += nblocks;
    }
    flags &= ~I_CHEAP;

    /*
     * Recursively free last partial block.
     */
    if (level > SINGLE && lastbn >= 0) {
        last = lastbn % factor;
        nb = bap[i];
        if (nb != 0)
            blocksreleased +=
                indirtrunc(ip, nb, last, level - 1, flags);
    }
    brelse(bp);
    return (blocksreleased);
}
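
/*
 * Worked example (added commentary; assumes NINDIR(fs) == 2048 as above):
 * for a DOUBLE-level call, factor == NINDIR(fs), so a lastbn of 5000
 * yields last = 5000 / 2048 = 2, i.e. entries 3 .. NINDIR-1 of the double
 * indirect block are freed outright, and entry 2 is descended into with
 * lastbn % factor == 904 as the new cut-off for the SINGLE level.
 */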

/*
 * Truncate the inode ip to at most length size.
 * Free affected disk blocks -- the blocks of the
 * file are removed in reverse order.
 *
 * N.B.: triple indirect blocks are untested.
 */
static int i_genrand = 1234;
int
ufs_itrunc(struct inode *oip, u_offset_t length, int flags, cred_t *cr)
{
    struct fs *fs = oip->i_fs;
    struct ufsvfs *ufsvfsp = oip->i_ufsvfs;
    struct inode *ip;
    daddr_t lastblock;
    off_t bsize;
    int boff;
    daddr_t bn, lastiblock[NIADDR];
    int level;
    long nblocks, blocksreleased = 0;
    int i;
    ushort_t mode;
    struct inode tip;
    int err;
    u_offset_t maxoffset = (ufsvfsp->vfs_lfflags & UFS_LARGEFILES) ?
        (UFS_MAXOFFSET_T) : (MAXOFF32_T);

    /*
     * Shadow inodes do not need to hold the vfs_dqrwlock lock.  Most
     * other uses need the reader lock.  opendq() holds the writer lock.
     */
    ASSERT((oip->i_mode & IFMT) == IFSHAD ||
        RW_LOCK_HELD(&ufsvfsp->vfs_dqrwlock));
    ASSERT(RW_WRITE_HELD(&oip->i_contents));
    /*
     * We only allow truncation of regular files and directories
     * to arbitrary lengths here.  In addition, we allow symbolic
     * links to be truncated only to zero length.  Other inode
     * types cannot have their length set here.  Disk blocks are
     * being dealt with - especially device inodes where
     * ip->i_ordev is actually being stored in ip->i_db[0]!
     */
    TRANS_INODE(ufsvfsp, oip);
    mode = oip->i_mode & IFMT;
    if (flags & I_FREE) {
        i_genrand *= 16843009;	/* turns into shift and adds */
        i_genrand++;
        oip->i_gen += ((i_genrand + lbolt) & 0xffff) + 1;
        oip->i_flag |= ICHG |IUPD;
        oip->i_seq++;
        if (length == oip->i_size)
            return (0);
        flags |= I_CHEAP;
    }
    if (mode == IFIFO)
        return (0);
    if (mode != IFREG && mode != IFDIR && mode != IFATTRDIR &&
        !(mode == IFLNK && length == (offset_t)0) && mode != IFSHAD)
        return (EINVAL);
    if (length > maxoffset)
        return (EFBIG);
    if ((mode == IFDIR) || (mode == IFATTRDIR))
        flags |= I_DIR;
    if (mode == IFSHAD)
        flags |= I_SHAD;
    if (oip == ufsvfsp->vfs_qinod)
        flags |= I_QUOTA;
    if (length == oip->i_size) {
        /* update ctime and mtime to please POSIX tests */
        oip->i_flag |= ICHG |IUPD;
        oip->i_seq++;
        if (length == 0) {
            /* nothing to cache so clear the flag */
            oip->i_flag &= ~IFASTSYMLNK;
        }
        return (0);
    }
    /* wipe out fast symlink till next access */
    if (oip->i_flag & IFASTSYMLNK) {
        int j;

        ASSERT(ITOV(oip)->v_type == VLNK);

        oip->i_flag &= ~IFASTSYMLNK;

        for (j = 1; j < NDADDR; j++)
            oip->i_db[j] = 0;
        for (j = 0; j < NIADDR; j++)
            oip->i_ib[j] = 0;
    }

    boff = (int)blkoff(fs, length);

    if (length > oip->i_size) {
        /*
         * Trunc up case.  BMAPALLOC will ensure that the right blocks
         * are allocated.  This includes extending the old frag to a
         * full block (if needed) in addition to doing any work
         * needed for allocating the last block.
         */
        if (boff == 0)
            err = BMAPALLOC(oip, length - 1, (int)fs->fs_bsize, cr);
        else
            err = BMAPALLOC(oip, length - 1, boff, cr);

        if (err == 0) {
            /*
             * Save old size and set inode's size now
             * so that we don't cause too much of the
             * file to be zero'd and pushed.
             */
            u_offset_t osize = oip->i_size;
            oip->i_size = length;
            /*
             * Make sure we zero out the remaining bytes of
             * the page in case a mmap scribbled on it.  We
             * can't prevent a mmap from writing beyond EOF
             * on the last page of a file.
             */
            if ((boff = (int)blkoff(fs, osize)) != 0) {
                bsize = (int)lblkno(fs, osize - 1) >= NDADDR ?
                    fs->fs_bsize : fragroundup(fs, boff);
                pvn_vpzero(ITOV(oip), osize,
                    (size_t)(bsize - boff));
            }
            oip->i_flag |= ICHG|IATTCHG;
            oip->i_seq++;
            ITIMES_NOLOCK(oip);
            /*
             * MAXOFF32_T is old 2GB size limit.  If
             * this operation caused a large file to be
             * created, turn on the superblock flag
             * and update the superblock, if the flag
             * is not already on.
             */
            if ((length > (u_offset_t)MAXOFF32_T) &&
                !(fs->fs_flags & FSLARGEFILES)) {
                ASSERT(ufsvfsp->vfs_lfflags & UFS_LARGEFILES);
                mutex_enter(&ufsvfsp->vfs_lock);
                fs->fs_flags |= FSLARGEFILES;
                ufs_sbwrite(ufsvfsp);
                mutex_exit(&ufsvfsp->vfs_lock);
            }
        }

        return (err);
    }

    /*
     * Update the pages of the file.  If the file is not being
     * truncated to a block boundary, the contents of the
     * pages following the end of the file must be zero'ed
     * in case it ever becomes accessible again because
     * of subsequent file growth.
     */
    if (boff == 0) {
        (void) pvn_vplist_dirty(ITOV(oip), length, ufs_putapage,
            B_INVAL | B_TRUNC, CRED());
    } else {
        /*
         * Make sure that the last block is properly allocated.
         * We only really have to do this if the last block is
         * actually allocated since ufs_bmap will now handle the case
         * of a fragment which has no block allocated.  Just to
         * be sure, we do it now independent of current allocation.
         */
        err = BMAPALLOC(oip, length - 1, boff, cr);
        if (err)
            return (err);

        /*
         * BMAPALLOC will call bmap_write which defers i_seq
         * processing.  If the timestamps were changed, update
         * i_seq before rdip drops i_contents or syncs the inode.
         */
        if (oip->i_flag & (ICHG|IUPD))
            oip->i_seq++;

        /*
         * BugId 4069932
         * Make sure that the relevant partial page appears in
         * the v_pages list, so that pvn_vpzero() will do its
         * job.  Since doing this correctly requires everything
         * in rdip() except for the uiomove(), it's easier and
         * safer to do the uiomove() rather than duplicate the
         * rest of rdip() here.
         *
         * To get here, we know that length indicates a byte
         * that is not the first byte of a block.  (length - 1)
         * is the last actual byte known to exist.  Deduction
         * shows it is in the same block as byte (length).
         * Thus, this rdip() invocation should always succeed
         * except in the face of i/o errors, and give us the
         * block we care about.
         *
         * rdip() makes the same locking assertions and
         * assumptions as we do.  We do not acquire any locks
         * before calling it, so we have not changed the locking
         * situation.  Finally, there do not appear to be any
         * paths whereby rdip() ends up invoking us again.
         * Thus, infinite recursion is avoided.
         */
        {
            uio_t uio;
            iovec_t iov[1];
            char buffer;

            uio.uio_iov = iov;
            uio.uio_iovcnt = 1;
            uio.uio_loffset = length - 1;
            uio.uio_resid = 1;
            uio.uio_segflg = UIO_SYSSPACE;
            uio.uio_extflg = UIO_COPY_CACHED;

            iov[0].iov_base = &buffer;
            iov[0].iov_len = 1;

            err = rdip(oip, &uio, UIO_READ, NULL);
            if (err)
                return (err);
        }

        bsize = (int)lblkno(fs, length - 1) >= NDADDR ?
            fs->fs_bsize : fragroundup(fs, boff);
        pvn_vpzero(ITOV(oip), length, (size_t)(bsize - boff));
        /*
         * Ensure full fs block is marked as dirty.
         */
        (void) pvn_vplist_dirty(ITOV(oip), length + (bsize - boff),
            ufs_putapage, B_INVAL | B_TRUNC, CRED());
    }
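
    /*
     * Worked example (added commentary; assumes an 8 KB block size,
     * NDADDR == 12 and NINDIR == 2048): truncating to length 0 makes
     * lastblock -1, so every lastiblock[] entry is negative and all
     * direct and indirect blocks are released below.  Truncating to
     * 1 MB keeps logical blocks 0..127, so lastblock = 127,
     * lastiblock[SINGLE] = 115, and the DOUBLE/TRIPLE entries stay
     * negative because the file never needed those levels.
     */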

    /*
     * Calculate index into inode's block list of
     * last direct and indirect blocks (if any)
     * which we want to keep.  Lastblock is -1 when
     * the file is truncated to 0.
     */
    lastblock = lblkno(fs, length + fs->fs_bsize - 1) - 1;
    lastiblock[SINGLE] = lastblock - NDADDR;
    lastiblock[DOUBLE] = lastiblock[SINGLE] - NINDIR(fs);
    lastiblock[TRIPLE] = lastiblock[DOUBLE] - NINDIR(fs) * NINDIR(fs);
    nblocks = btodb(fs->fs_bsize);

    /*
     * Update file and block pointers
     * on disk before we start freeing blocks.
     * If we crash before free'ing blocks below,
     * the blocks will be returned to the free list.
     * lastiblock values are also normalized to -1
     * for calls to indirtrunc below.
     */
    tip = *oip;			/* structure copy */
    ip = &tip;

    for (level = TRIPLE; level >= SINGLE; level--)
        if (lastiblock[level] < 0) {
            oip->i_ib[level] = 0;
            lastiblock[level] = -1;
        }
    for (i = NDADDR - 1; i > lastblock; i--) {
        oip->i_db[i] = 0;
        flags |= I_CHEAP;
    }
    oip->i_size = length;
    oip->i_flag |= ICHG|IUPD|IATTCHG;
    oip->i_seq++;
    if (!TRANS_ISTRANS(ufsvfsp))
        ufs_iupdat(oip, I_SYNC);	/* do sync inode update */

    /*
     * Indirect blocks first.
     */
    for (level = TRIPLE; level >= SINGLE; level--) {
        bn = ip->i_ib[level];
        if (bn != 0) {
            blocksreleased +=
                indirtrunc(ip, bn, lastiblock[level], level, flags);
            if (lastiblock[level] < 0) {
                ip->i_ib[level] = 0;
                free(ip, bn, (off_t)fs->fs_bsize,
                    flags | I_IBLK);
                blocksreleased += nblocks;
            }
        }
        if (lastiblock[level] >= 0)
            goto done;
    }

    /*
     * All whole direct blocks or frags.
     */
    for (i = NDADDR - 1; i > lastblock; i--) {
        bn = ip->i_db[i];
        if (bn == 0)
            continue;
        ip->i_db[i] = 0;
        bsize = (off_t)blksize(fs, ip, i);
        free(ip, bn, bsize, flags);
        blocksreleased += btodb(bsize);
    }
    if (lastblock < 0)
        goto done;

    /*
     * Finally, look for a change in size of the
     * last direct block; release any frags.
     */
    bn = ip->i_db[lastblock];
    if (bn != 0) {
        off_t oldspace, newspace;

        /*
         * Calculate amount of space we're giving
         * back as old block size minus new block size.
         */
        oldspace = blksize(fs, ip, lastblock);
        UFS_SET_ISIZE(length, ip);
        newspace = blksize(fs, ip, lastblock);
        if (newspace == 0) {
            err = ufs_fault(ITOV(ip), "ufs_itrunc: newspace == 0");
            return (err);
        }
        if (oldspace - newspace > 0) {
            /*
             * Block number of space to be free'd is
             * the old block # plus the number of frags
             * required for the storage we're keeping.
             */
            bn += numfrags(fs, newspace);
            free(ip, bn, oldspace - newspace, flags);
            blocksreleased += btodb(oldspace - newspace);
        }
    }
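    /*
     * Frag arithmetic sketch (added for illustration; assumes 8 KB blocks
     * with 1 KB fragments): if the last direct block used to occupy a full
     * 8 KB (oldspace) and the new length needs only 3 KB of it, newspace
     * comes back as 3 frags' worth, so the free() above starts at
     * bn + 3 frags and returns the remaining 5 KB to the free list.
     */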
done:
    /* BEGIN PARANOIA */
    for (level = SINGLE; level <= TRIPLE; level++)
        if (ip->i_ib[level] != oip->i_ib[level]) {
            err = ufs_fault(ITOV(ip), "ufs_itrunc: indirect block");
            return (err);
        }

    for (i = 0; i < NDADDR; i++)
        if (ip->i_db[i] != oip->i_db[i]) {
            err = ufs_fault(ITOV(ip), "ufs_itrunc: direct block");
            return (err);
        }
    /* END PARANOIA */
    oip->i_blocks -= blocksreleased;

    if (oip->i_blocks < 0) {	/* sanity */
        cmn_err(CE_NOTE,
            "ufs_itrunc: %s/%d new size = %lld, blocks = %d\n",
            fs->fs_fsmnt, (int)oip->i_number, oip->i_size,
            (int)oip->i_blocks);
        oip->i_blocks = 0;
    }
    oip->i_flag |= ICHG|IATTCHG;
    oip->i_seq++;
    /* blocksreleased is >= zero, so this can not fail */
    (void) chkdq(oip, -blocksreleased, 0, cr, (char **)NULL,
        (size_t *)NULL);
    return (0);
}

/*
 * Check mode permission on inode.  Mode is READ, WRITE or EXEC.
 * In the case of WRITE, the read-only status of the file system
 * is checked.  Depending on the calling user, the appropriate
 * mode bits are selected; privileges to override missing permission
 * bits are checked through secpolicy_vnode_access().
 */
int
ufs_iaccess(void *vip, int mode, struct cred *cr)
{
    struct inode *ip = vip;
    int shift = 0;

    if (mode & IWRITE) {
        /*
         * Disallow write attempts on read-only
         * file systems, unless the file is a block
         * or character device or a FIFO.
         */
        if (ip->i_fs->fs_ronly != 0) {
            if ((ip->i_mode & IFMT) != IFCHR &&
                (ip->i_mode & IFMT) != IFBLK &&
                (ip->i_mode & IFMT) != IFIFO) {
                return (EROFS);
            }
        }
    }
    /*
     * If there is a shadow inode, check for the presence of an ACL;
     * if the ACL is present, use the ufs_acl_access routine to do
     * the access check.
     */
    if (ip->i_ufs_acl && ip->i_ufs_acl->aowner)
        return (ufs_acl_access(ip, mode, cr));

    /*
     * Access check is based on only
     * one of owner, group, public.
     * If not owner, then check group.
     * If not a member of the group, then
     * check public access.
     */
    if (crgetuid(cr) != ip->i_uid) {
        shift += 3;
        if (!groupmember((uid_t)ip->i_gid, cr))
            shift += 3;
    }

    mode &= ~(ip->i_mode << shift);

    if (mode == 0)
        return (0);

    /* test missing privilege bits */
    return (secpolicy_vnode_access(cr, ITOV(ip), ip->i_uid, mode));
}
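
/*
 * Worked example (added commentary): for a file with mode 0640, an IREAD
 * request by a non-owner group member shifts i_mode left by 3, lining the
 * group bits (4 = r--) up with the owner positions; mode &= ~(...) then
 * clears IREAD and the check passes.  A non-member shifts by 6, leaving
 * IREAD set, so the request falls through to secpolicy_vnode_access() to
 * look for an overriding privilege.
 */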

/*
 * if necessary, remove an inode from the free list
 * i_contents is held except at unmount
 *
 * Return 1 if the inode is taken off of the ufs_idle_q,
 * and the caller is expected to call VN_RELE.
 *
 * Return 0 otherwise.
 */
int
ufs_rmidle(struct inode *ip)
{
    int rval = 0;

    mutex_enter(&ip->i_tlock);
    if ((ip->i_flag & IREF) == 0) {
        mutex_enter(&ufs_idle_q.uq_mutex);
        ip->i_freef->i_freeb = ip->i_freeb;
        ip->i_freeb->i_freef = ip->i_freef;
        ip->i_freef = ip;
        ip->i_freeb = ip;
        ip->i_flag |= IREF;
        ufs_idle_q.uq_ne--;
        if (ip->i_flag & IJUNKIQ) {
            ufs_njunk_iq--;
            ip->i_flag &= ~IJUNKIQ;
        } else {
            ufs_nuseful_iq--;
        }
        mutex_exit(&ufs_idle_q.uq_mutex);
        rval = 1;
    }
    mutex_exit(&ip->i_tlock);
    return (rval);
}

/*
 * scan the hash of inodes and call func with the inode locked
 */
int
ufs_scan_inodes(int rwtry, int (*func)(struct inode *, void *), void *arg,
    struct ufsvfs *ufsvfsp)
{
    struct inode *ip;		/* current inode */
    struct inode *lip = NULL;	/* last/previous inode */
    union ihead *ih;		/* current hash chain */
    int error, i;
    int saverror = 0;
    int lip_held;		/* lip needs a VN_RELE() */

    /*
     * If ufsvfsp is NULL, then our caller should be holding
     * ufs_scan_lock to avoid conflicts between ufs_unmount() and
     * ufs_update().  Otherwise, to avoid false-positives in
     * ufs_unmount()'s v_count-based EBUSY check, we only hold
     * those inodes that are in the file system our caller cares
     * about.
     *
     * We know that ip is a valid inode in the hash chain (and thus
     * we can trust i_ufsvfs) because the inode we chained from
     * (lip) is still in the hash chain.  This is true because either:
     *
     * 1. We did not drop the hash chain lock since the last
     *    iteration (because we were not interested in the last inode),
     *    or
     * 2. We maintained a hold on the last inode while we
     *    were processing it, so it could not be removed
     *    from the hash chain.
     *
     * The whole reason we're dropping and re-grabbing the chain
     * lock on every inode is so that we don't present a major
     * choke point on throughput, particularly when we've been
     * called on behalf of fsflush.
     */

    for (i = 0, ih = ihead; i < inohsz; i++, ih++) {
        mutex_enter(&ih_lock[i]);
        for (ip = ih->ih_chain[0], lip_held = 0;
            ip != (struct inode *)ih;
            ip = lip->i_forw) {

            ins.in_scan.value.ul++;

            /*
             * Undo the previous iteration's VN_HOLD(), but
             * only if one was done.
             */
            if (lip_held)
                VN_RELE(ITOV(lip));

            lip = ip;
            if (ufsvfsp != NULL && ip->i_ufsvfs != ufsvfsp) {
                /*
                 * We're not processing all inodes, and
                 * this inode is not in the filesystem of
                 * interest, so skip it.  No need to do a
                 * VN_HOLD() since we're not dropping the
                 * hash chain lock until after we've
                 * done the i_forw traversal above.
                 */
                lip_held = 0;
                continue;
            }
            VN_HOLD(ITOV(ip));
            lip_held = 1;
            mutex_exit(&ih_lock[i]);

            /*
             * Acquire the contents lock as writer to make
             * sure that the inode has been initialized in
             * the cache or removed from the idle list by
             * ufs_iget().  This works because ufs_iget()
             * acquires the contents lock before putting
             * the inode into the cache.  If we can lock
             * it, then ufs_iget() is done with it.
             */
            if (rwtry) {
                if (!rw_tryenter(&ip->i_contents, RW_WRITER)) {
                    mutex_enter(&ih_lock[i]);
                    continue;
                }
            } else {
                rw_enter(&ip->i_contents, RW_WRITER);
            }

            rw_exit(&ip->i_contents);

            /*
             * ISTALE means the inode couldn't be read
             *
             * We don't have to hold the i_contents lock
             * for this check for a couple of
             * reasons.  First, if ISTALE is set then the
             * flag cannot be cleared until the inode is
             * removed from the cache and that cannot
             * happen until after we VN_RELE() it.
             * Second, if ISTALE is not set, then the
             * inode is in the cache and does not need to
             * be read from disk so ISTALE cannot be set
             * while we are not looking.
             */
            if ((ip->i_flag & ISTALE) == 0) {
                if ((error = (*func)(ip, arg)) != 0)
                    saverror = error;
            }

            mutex_enter(&ih_lock[i]);
        }
        if (lip_held)
            VN_RELE(ITOV(lip));
        mutex_exit(&ih_lock[i]);
    }
    return (saverror);
}

/*
 * Mark inode with the current time, plus a unique increment.
 *
 * Since we only keep 32-bit time on disk, if UFS is still alive
 * beyond 2038, filesystem times will simply stick at the last
 * possible second of 32-bit time.  Not ideal, but probably better
 * than going into the remote past, or confusing applications with
 * negative time.
 */
void
ufs_imark(struct inode *ip)
{
    timestruc_t now;
    int32_t usec, nsec;

    /*
     * The update of i_seq may have been deferred, increase i_seq here
     * to make sure it is in sync with the timestamps.
     */
    if (ip->i_flag & ISEQ) {
        ASSERT(ip->i_flag & (IUPD|ICHG));
        ip->i_seq++;
        ip->i_flag &= ~ISEQ;
    }

    gethrestime(&now);

    /*
     * Fast algorithm to convert nsec to usec -- see hrt2ts()
     * in common/os/timers.c for a full description.
     */
    nsec = now.tv_nsec;
    usec = nsec + (nsec >> 2);
    usec = nsec + (usec >> 1);
    usec = nsec + (usec >> 2);
    usec = nsec + (usec >> 4);
    usec = nsec - (usec >> 3);
    usec = nsec + (usec >> 2);
    usec = nsec + (usec >> 3);
    usec = nsec + (usec >> 4);
    usec = nsec + (usec >> 1);
    usec = nsec + (usec >> 6);
    usec = usec >> 10;

    mutex_enter(&ufs_iuniqtime_lock);
    if (now.tv_sec > (time_t)iuniqtime.tv_sec ||
        usec > iuniqtime.tv_usec) {
        if (now.tv_sec < TIME32_MAX) {
            iuniqtime.tv_sec = (time32_t)now.tv_sec;
            iuniqtime.tv_usec = usec;
        }
    } else {
        if (iuniqtime.tv_sec < TIME32_MAX) {
            iuniqtime.tv_usec++;
            /* Check for usec overflow */
            if (iuniqtime.tv_usec >= MICROSEC) {
                iuniqtime.tv_sec++;
                iuniqtime.tv_usec = 0;
            }
        }
    }

    if ((ip->i_flag & IACC) && !(ip->i_ufsvfs->vfs_noatime)) {
        ip->i_atime = iuniqtime;
    }
    if (ip->i_flag & IUPD) {
        ip->i_mtime = iuniqtime;
        ip->i_flag |= IMODTIME;
    }
    if (ip->i_flag & ICHG) {
        ip->i_diroff = 0;
        ip->i_ctime = iuniqtime;
    }
    mutex_exit(&ufs_iuniqtime_lock);
}
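
/*
 * Note on the shift sequence above (added commentary): it computes
 * usec = nsec / 1000 without a divide.  The chain of shifts and adds
 * builds up a multiplier close to 2^10 / 1000 = 1.024, and the final
 * right shift by 10 completes the division, so the result tracks
 * integer division by 1000 over the valid nanosecond range.
 */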
1816 */ 1817 if (((ip->i_flag & (IUPD|IACC|ICHG)) == IACC) && 1818 (ip->i_ufsvfs->vfs_noatime)) { 1819 return; 1820 } 1821 1822 if (ip->i_flag & (IUPD|IACC|ICHG)) { 1823 if (ip->i_flag & ICHG) 1824 ip->i_flag |= IMOD; 1825 else 1826 ip->i_flag |= IMODACC; 1827 ufs_imark(ip); 1828 ip->i_flag &= ~(IACC|IUPD|ICHG); 1829 } 1830 } 1831