/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 1983, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2016, 2017 by Delphix. All rights reserved.
 */

/*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/uio.h>
#include <sys/bitmap.h>
#include <sys/signal.h>
#include <sys/cred.h>
#include <sys/user.h>
#include <sys/vfs.h>
#include <sys/stat.h>
#include <sys/vnode.h>
#include <sys/buf.h>
#include <sys/proc.h>
#include <sys/disp.h>
#include <sys/dnlc.h>
#include <sys/mode.h>
#include <sys/cmn_err.h>
#include <sys/kstat.h>
#include <sys/acl.h>
#include <sys/var.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_fs.h>
#include <sys/fs/ufs_trans.h>
#include <sys/fs/ufs_acl.h>
#include <sys/fs/ufs_bio.h>
#include <sys/fs/ufs_quota.h>
#include <sys/fs/ufs_log.h>
#include <vm/hat.h>
#include <vm/as.h>
#include <vm/pvn.h>
#include <vm/seg.h>
#include <sys/swap.h>
#include <sys/cpuvar.h>
#include <sys/sysmacros.h>
#include <sys/errno.h>
#include <sys/kmem.h>
#include <sys/debug.h>
#include <fs/fs_subr.h>
#include <sys/policy.h>

struct kmem_cache *inode_cache;		/* cache of free inodes */

/* UFS Inode Cache Stats -- Not protected */
struct instats ins = {
	{ "size",		KSTAT_DATA_ULONG },
	{ "maxsize",		KSTAT_DATA_ULONG },
	{ "hits",		KSTAT_DATA_ULONG },
	{ "misses",		KSTAT_DATA_ULONG },
	{ "kmem allocs",	KSTAT_DATA_ULONG },
	{ "kmem frees",		KSTAT_DATA_ULONG },
	{ "maxsize reached",	KSTAT_DATA_ULONG },
	{ "puts at frontlist",	KSTAT_DATA_ULONG },
	{ "puts at backlist",	KSTAT_DATA_ULONG },
	{ "queues to free",	KSTAT_DATA_ULONG },
	{ "scans",		KSTAT_DATA_ULONG },
	{ "thread idles",	KSTAT_DATA_ULONG },
	{ "lookup idles",	KSTAT_DATA_ULONG },
	{ "vget idles",		KSTAT_DATA_ULONG },
	{ "cache allocs",	KSTAT_DATA_ULONG },
	{ "cache frees",	KSTAT_DATA_ULONG },
	{ "pushes at close",	KSTAT_DATA_ULONG }
};

/* kstat data */
static kstat_t *ufs_inode_kstat = NULL;
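
/*
 * Observability note (illustrative, not part of the build): ufs_iinit()
 * below publishes the `ins' counters as the named kstat ufs:0:inode_cache,
 * so they should be visible from userland with something like:
 *
 *	$ kstat -m ufs -n inode_cache
 */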

union ihead *ihead;	/* inode LRU cache, Chris Maltby */
kmutex_t *ih_lock;	/* protect inode cache hash table */
static int ino_hashlen = 4;	/* desired average hash chain length */
int inohsz;		/* number of buckets in the hash table */
struct timeval32 iuniqtime;

kmutex_t ufs_scan_lock;		/* stop racing multiple ufs_scan_inodes() */
kmutex_t ufs_iuniqtime_lock;	/* protect iuniqtime */
kmutex_t ufsvfs_mutex;
struct ufsvfs *oldufsvfslist, *ufsvfslist;

/*
 * time to wait after ufsvfsp->vfs_iotstamp before declaring that no
 * I/Os are going on.
 */
clock_t ufs_iowait;

/*
 * The threads that process idle inodes and free (deleted) inodes
 * have high water marks that are set in ufs_iinit().
 * These values can be no less than the minimum shown below.
 */
int ufs_idle_max;	/* # of allowable idle inodes */
ulong_t ufs_inode_max;	/* hard limit of allowable idle inodes */
#define	UFS_IDLE_MAX	(16)	/* min # of allowable idle inodes */

/*
 * Tunables for ufs write throttling.
 * These are validated in ufs_iinit() since improper settings
 * can lead to filesystem hangs.
 */
#define	UFS_HW_DEFAULT	(16 * 1024 * 1024)
#define	UFS_LW_DEFAULT	(8 * 1024 * 1024)
int ufs_HW = UFS_HW_DEFAULT;
int ufs_LW = UFS_LW_DEFAULT;

static void ihinit(void);
extern int hash2ints(int, int);

static int ufs_iget_internal(struct vfs *, ino_t, struct inode **,
    struct cred *, int);

/* ARGSUSED */
static int
ufs_inode_kstat_update(kstat_t *ksp, int rw)
{
	if (rw == KSTAT_WRITE)
		return (EACCES);

	ins.in_malloc.value.ul = (ulong_t)kmem_cache_stat(inode_cache,
	    "slab_alloc");
	ins.in_mfree.value.ul = (ulong_t)kmem_cache_stat(inode_cache,
	    "slab_free");
	ins.in_kcalloc.value.ul = (ulong_t)kmem_cache_stat(inode_cache,
	    "alloc");
	ins.in_kcfree.value.ul = (ulong_t)kmem_cache_stat(inode_cache,
	    "free");
	ins.in_size.value.ul = (ulong_t)kmem_cache_stat(inode_cache,
	    "buf_inuse");
	ins.in_maxreached.value.ul = (ulong_t)kmem_cache_stat(inode_cache,
	    "buf_max");
	ins.in_misses.value.ul = ins.in_kcalloc.value.ul;

	return (0);
}
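
/*
 * Tuning note (an illustrative sketch; the values are examples only):
 * ufs_HW and ufs_LW are normally overridden from /etc/system, e.g.:
 *
 *	set ufs:ufs_HW = 0x2000000
 *	set ufs:ufs_LW = 0x1000000
 *
 * ufs_iinit() below sanity-checks such settings and falls back to the
 * defaults when ufs_HW <= ufs_LW, since a bad pair can hang writers
 * blocked on the high-water mark.
 */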

void
ufs_iinit(void)
{
	/*
	 * Validate that ufs_HW > ufs_LW.
	 * The default values for these two tunables have been increased.
	 * There is now a range of values for ufs_HW that used to be
	 * legal on previous Solaris versions but no longer is now.
	 * Upgrading a machine which has an /etc/system setting for ufs_HW
	 * from that range can lead to filesystem hangs unless the values
	 * are checked here.
	 */
	if (ufs_HW <= ufs_LW) {
		cmn_err(CE_WARN,
		    "ufs_HW (%d) <= ufs_LW (%d). Check /etc/system.",
		    ufs_HW, ufs_LW);
		ufs_LW = UFS_LW_DEFAULT;
		ufs_HW = UFS_HW_DEFAULT;
		cmn_err(CE_CONT, "using defaults, ufs_HW = %d, ufs_LW = %d\n",
		    ufs_HW, ufs_LW);
	}

	/*
	 * Adjust the tunable `ufs_ninode' to a reasonable value
	 */
	if (ufs_ninode <= 0)
		ufs_ninode = ncsize;
	if (ufs_inode_max == 0)
		ufs_inode_max =
		    (ulong_t)((kmem_maxavail() >> 2) / sizeof (struct inode));
	if (ufs_ninode > ufs_inode_max || (ufs_ninode == 0 && ncsize == 0)) {
		cmn_err(CE_NOTE, "setting ufs_ninode to max value of %ld",
		    ufs_inode_max);
		ufs_ninode = ufs_inode_max;
	}
	/*
	 * Wait till third call of ufs_update to declare that no I/Os are
	 * going on. This allows deferred access times to be flushed to disk.
	 */
	ufs_iowait = v.v_autoup * hz * 2;

	/*
	 * idle thread runs when 25% of ufs_ninode entries are on the queue
	 */
	if (ufs_idle_max == 0)
		ufs_idle_max = ufs_ninode >> 2;
	if (ufs_idle_max < UFS_IDLE_MAX)
		ufs_idle_max = UFS_IDLE_MAX;
	if (ufs_idle_max > ufs_ninode)
		ufs_idle_max = ufs_ninode;
	/*
	 * This is really a misnomer, it is ufs_queue_init
	 */
	ufs_thread_init(&ufs_idle_q, ufs_idle_max);
	ufs_thread_start(&ufs_idle_q, ufs_thread_idle, NULL);

	/*
	 * global hlock thread
	 */
	ufs_thread_init(&ufs_hlock, 1);
	ufs_thread_start(&ufs_hlock, ufs_thread_hlock, NULL);

	ihinit();
	qtinit();
	ins.in_maxsize.value.ul = ufs_ninode;
	if ((ufs_inode_kstat = kstat_create("ufs", 0, "inode_cache", "ufs",
	    KSTAT_TYPE_NAMED, sizeof (ins) / sizeof (kstat_named_t),
	    KSTAT_FLAG_VIRTUAL)) != NULL) {
		ufs_inode_kstat->ks_data = (void *)&ins;
		ufs_inode_kstat->ks_update = ufs_inode_kstat_update;
		kstat_install(ufs_inode_kstat);
	}
	ufsfx_init();		/* fix-on-panic initialization */
	si_cache_init();
	ufs_directio_init();
	lufs_init();
	mutex_init(&ufs_iuniqtime_lock, NULL, MUTEX_DEFAULT, NULL);
}

/* ARGSUSED */
static int
ufs_inode_cache_constructor(void *buf, void *cdrarg, int kmflags)
{
	struct inode *ip = buf;
	struct vnode *vp;

	vp = ip->i_vnode = vn_alloc(kmflags);
	if (vp == NULL) {
		return (-1);
	}
	vn_setops(vp, ufs_vnodeops);
	vp->v_data = ip;

	rw_init(&ip->i_rwlock, NULL, RW_DEFAULT, NULL);
	rw_init(&ip->i_contents, NULL, RW_DEFAULT, NULL);
	mutex_init(&ip->i_tlock, NULL, MUTEX_DEFAULT, NULL);
	dnlc_dir_init(&ip->i_danchor);

	cv_init(&ip->i_wrcv, NULL, CV_DRIVER, NULL);

	return (0);
}

/* ARGSUSED */
static void
ufs_inode_cache_destructor(void *buf, void *cdrarg)
{
	struct inode *ip = buf;
	struct vnode *vp;

	vp = ITOV(ip);

	rw_destroy(&ip->i_rwlock);
	rw_destroy(&ip->i_contents);
	mutex_destroy(&ip->i_tlock);
	if (vp->v_type == VDIR) {
		dnlc_dir_fini(&ip->i_danchor);
	}

	cv_destroy(&ip->i_wrcv);

	vn_free(vp);
}
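
/*
 * The constructor/destructor pair above is standard kmem object caching:
 * the vnode, locks, CV and dnlc anchor are constructed once, when a buffer
 * first enters the cache, and survive kmem_cache_alloc()/kmem_cache_free()
 * cycles, so ufs_alloc_inode() below only resets the cheap per-use fields.
 */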

/*
 * Initialize hash links for inodes
 * and build inode free list.
 */
void
ihinit(void)
{
	int i;
	union ihead *ih = ihead;

	mutex_init(&ufs_scan_lock, NULL, MUTEX_DEFAULT, NULL);

	inohsz = 1 << highbit(ufs_ninode / ino_hashlen);
	ihead = kmem_zalloc(inohsz * sizeof (union ihead), KM_SLEEP);
	ih_lock = kmem_zalloc(inohsz * sizeof (kmutex_t), KM_SLEEP);

	for (i = 0, ih = ihead; i < inohsz; i++, ih++) {
		ih->ih_head[0] = ih;
		ih->ih_head[1] = ih;
		mutex_init(&ih_lock[i], NULL, MUTEX_DEFAULT, NULL);
	}
	inode_cache = kmem_cache_create("ufs_inode_cache",
	    sizeof (struct inode), 0, ufs_inode_cache_constructor,
	    ufs_inode_cache_destructor, ufs_inode_cache_reclaim,
	    NULL, NULL, 0);
}

/*
 * Free an inode structure
 */
void
ufs_free_inode(struct inode *ip)
{
	vn_invalid(ITOV(ip));
	kmem_cache_free(inode_cache, ip);
}

/*
 * Allocate an inode structure
 */
struct inode *
ufs_alloc_inode(ufsvfs_t *ufsvfsp, ino_t ino)
{
	struct inode *ip;
	vnode_t *vp;

	ip = kmem_cache_alloc(inode_cache, KM_SLEEP);
	/*
	 * at this point we have a newly allocated inode
	 */
	ip->i_freef = ip;
	ip->i_freeb = ip;
	ip->i_flag = IREF;
	ip->i_seq = 0xFF;	/* Unique initial value */
	ip->i_dev = ufsvfsp->vfs_dev;
	ip->i_ufsvfs = ufsvfsp;
	ip->i_devvp = ufsvfsp->vfs_devvp;
	ip->i_number = ino;
	ip->i_diroff = 0;
	ip->i_nextr = 0;
	ip->i_map = NULL;
	ip->i_rdev = 0;
	ip->i_writes = 0;
	ip->i_mode = 0;
	ip->i_delaylen = 0;
	ip->i_delayoff = 0;
	ip->i_nextrio = 0;
	ip->i_ufs_acl = NULL;
	ip->i_cflags = 0;
	ip->i_mapcnt = 0;
	ip->i_dquot = NULL;
	ip->i_cachedir = CD_ENABLED;
	ip->i_writer = NULL;

	/*
	 * the vnode for this inode was allocated by the constructor
	 */
	vp = ITOV(ip);
	vn_reinit(vp);
	if (ino == (ino_t)UFSROOTINO)
		vp->v_flag = VROOT;
	vp->v_vfsp = ufsvfsp->vfs_vfs;
	vn_exists(vp);
	return (ip);
}

/*
 * Look up an inode by device, inumber.  If it is in core (in the
 * inode structure), honor the locking protocol.  If it is not in
 * core, read it in from the specified device after freeing any pages.
 * In all cases, a pointer to a VN_HELD inode structure is returned.
 */
int
ufs_iget(struct vfs *vfsp, ino_t ino, struct inode **ipp, struct cred *cr)
{
	return (ufs_iget_internal(vfsp, ino, ipp, cr, 0));
}

/*
 * A version of ufs_iget which returns only allocated, linked inodes.
 * This is appropriate for any callers who do not expect a free inode.
 */
int
ufs_iget_alloced(struct vfs *vfsp, ino_t ino, struct inode **ipp,
    struct cred *cr)
{
	return (ufs_iget_internal(vfsp, ino, ipp, cr, 1));
}
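
/*
 * Typical caller pattern (a minimal sketch; locking and error handling
 * beyond the iget itself are elided):
 *
 *	struct inode *ip;
 *	int err;
 *
 *	if ((err = ufs_iget(vfsp, ino, &ip, cr)) == 0) {
 *		... use the inode; it is returned VN_HELD ...
 *		VN_RELE(ITOV(ip));
 *	}
 */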

/*
 * Set vnode attributes based on v_type, this should be called whenever
 * an inode's i_mode is changed.
 */
void
ufs_reset_vnode(vnode_t *vp)
{
	/*
	 * an old DBE hack
	 */
	if ((VTOI(vp)->i_mode & (ISVTX | IEXEC | IFDIR)) == ISVTX)
		vp->v_flag |= VSWAPLIKE;
	else
		vp->v_flag &= ~VSWAPLIKE;

	/*
	 * if not swap like and it's just a regular file, we want
	 * to maintain the vnode's pages sorted by clean/modified
	 * for faster sync'ing to disk
	 */
	if (vp->v_type == VREG)
		vp->v_flag |= VMODSORT;
	else
		vp->v_flag &= ~VMODSORT;

	/*
	 * Is this an attribute hidden dir?
	 */
	if ((VTOI(vp)->i_mode & IFMT) == IFATTRDIR)
		vp->v_flag |= V_XATTRDIR;
	else
		vp->v_flag &= ~V_XATTRDIR;
}

/*
 * Shared implementation of ufs_iget and ufs_iget_alloced.  The 'validate'
 * flag is used to distinguish the two; when true, we validate that the inode
 * being retrieved looks like a linked and allocated inode.
 */
/* ARGSUSED */
static int
ufs_iget_internal(struct vfs *vfsp, ino_t ino, struct inode **ipp,
    struct cred *cr, int validate)
{
	struct inode *ip, *sp;
	union ihead *ih;
	kmutex_t *ihm;
	struct buf *bp;
	struct dinode *dp;
	struct vnode *vp;
	extern vfs_t EIO_vfs;
	int error;
	int ftype;	/* XXX - Remove later on */
	dev_t vfs_dev;
	struct ufsvfs *ufsvfsp;
	struct fs *fs;
	int hno;
	daddr_t bno;
	ulong_t ioff;

	CPU_STATS_ADD_K(sys, ufsiget, 1);

	/*
	 * Lookup inode in cache.
	 */
	vfs_dev = vfsp->vfs_dev;
	hno = INOHASH(ino);
	ih = &ihead[hno];
	ihm = &ih_lock[hno];

again:
	mutex_enter(ihm);
	for (ip = ih->ih_chain[0]; ip != (struct inode *)ih; ip = ip->i_forw) {
		if (ino != ip->i_number || vfs_dev != ip->i_dev ||
		    (ip->i_flag & ISTALE))
			continue;

		/*
		 * Found the interesting inode; hold it and drop the cache lock
		 */
		vp = ITOV(ip);	/* for locknest */
		VN_HOLD(vp);
		mutex_exit(ihm);
		rw_enter(&ip->i_contents, RW_READER);

		/*
		 * if necessary, remove from idle list
		 */
		if ((ip->i_flag & IREF) == 0) {
			if (ufs_rmidle(ip))
				VN_RELE(vp);
		}

		/*
		 * Could the inode be read from disk?
		 */
		if (ip->i_flag & ISTALE) {
			rw_exit(&ip->i_contents);
			VN_RELE(vp);
			goto again;
		}

		ins.in_hits.value.ul++;
		*ipp = ip;

		/*
		 * Reset the vnode's attribute flags
		 */
		mutex_enter(&vp->v_lock);
		ufs_reset_vnode(vp);
		mutex_exit(&vp->v_lock);

		rw_exit(&ip->i_contents);

		return (0);
	}
	mutex_exit(ihm);

	/*
	 * Inode was not in cache.
	 *
	 * Allocate a new entry
	 */
	ufsvfsp = (struct ufsvfs *)vfsp->vfs_data;
	fs = ufsvfsp->vfs_fs;

	ip = ufs_alloc_inode(ufsvfsp, ino);
	vp = ITOV(ip);

	bno = fsbtodb(fs, itod(fs, ino));
	ioff = (sizeof (struct dinode)) * (itoo(fs, ino));
	ip->i_doff = (offset_t)ioff + ldbtob(bno);
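
	/*
	 * Worked example of the addressing above (assuming an 8 KB
	 * fs_bsize and the usual 128-byte struct dinode): itod() yields
	 * the filesystem block holding `ino', fsbtodb() converts that to
	 * a 512-byte device block, and itoo() gives the inode's index
	 * within the block, so ioff is its byte offset there and i_doff
	 * is the absolute on-device byte offset of the dinode.
	 */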

	/*
	 * put a place holder in the cache (if not already there)
	 */
	mutex_enter(ihm);
	for (sp = ih->ih_chain[0]; sp != (struct inode *)ih; sp = sp->i_forw)
		if (ino == sp->i_number && vfs_dev == sp->i_dev &&
		    ((sp->i_flag & ISTALE) == 0)) {
			mutex_exit(ihm);
			ufs_free_inode(ip);
			goto again;
		}
	/*
	 * It would be nice to ASSERT(RW_READ_HELD(&ufsvfsp->vfs_dqrwlock))
	 * here, but if we do, then shadow inode allocations panic the
	 * system.  We don't have to hold vfs_dqrwlock for shadow inodes
	 * and the ufs_iget() parameters don't tell us what we are getting
	 * so we have no way of knowing this is a ufs_iget() call from
	 * a ufs_ialloc() call for a shadow inode.
	 */
	rw_enter(&ip->i_contents, RW_WRITER);
	insque(ip, ih);
	mutex_exit(ihm);
	/*
	 * read the dinode
	 */
	bp = UFS_BREAD(ufsvfsp, ip->i_dev, bno, (int)fs->fs_bsize);

	/*
	 * Check I/O errors
	 */
	error = ((bp->b_flags & B_ERROR) ? geterror(bp) : 0);
	if (error) {
		brelse(bp);
		ip->i_flag |= ISTALE;	/* in case someone is looking it up */
		rw_exit(&ip->i_contents);
		vp->v_vfsp = &EIO_vfs;
		VN_RELE(vp);
		return (error);
	}
	/*
	 * initialize the inode's dinode
	 */
	dp = (struct dinode *)(ioff + bp->b_un.b_addr);
	ip->i_ic = dp->di_ic;	/* structure assignment */
	brelse(bp);

	/*
	 * Maintain compatibility with Solaris 1.x UFS
	 */
	if (ip->i_suid != UID_LONG)
		ip->i_uid = ip->i_suid;
	if (ip->i_sgid != GID_LONG)
		ip->i_gid = ip->i_sgid;

	ftype = ip->i_mode & IFMT;
	if (ftype == IFBLK || ftype == IFCHR) {
		dev_t dv;
		uint_t top16 = ip->i_ordev & 0xffff0000u;

		if (top16 == 0 || top16 == 0xffff0000u)
			dv = expdev(ip->i_ordev);
		else
			dv = expldev(ip->i_ordev);
		vp->v_rdev = ip->i_rdev = dv;
	}

	/*
	 * if our caller only expects allocated inodes, verify that
	 * this inode looks good; throw it out if it's bad.
	 */
	if (validate) {
		if ((ftype == 0) || (ip->i_nlink <= 0)) {
			ip->i_flag |= ISTALE;
			rw_exit(&ip->i_contents);
			vp->v_vfsp = &EIO_vfs;
			VN_RELE(vp);
			cmn_err(CE_NOTE,
			    "%s: unexpected free inode %d, run fsck(1M)%s",
			    fs->fs_fsmnt, (int)ino,
			    (TRANS_ISTRANS(ufsvfsp) ? " -o f" : ""));
			return (EIO);
		}
	}

	/*
	 * Finish initializing the vnode, special handling for shadow inodes
	 * because IFTOVT() will produce a v_type of VNON which is not what we
	 * want, set v_type to VREG explicitly in that case.
	 */
	if (ftype == IFSHAD) {
		vp->v_type = VREG;
	} else {
		vp->v_type = IFTOVT((mode_t)ip->i_mode);
	}

	ufs_reset_vnode(vp);

	/*
	 * read the shadow
	 */
	if (ftype != 0 && ip->i_shadow != 0) {
		if ((error = ufs_si_load(ip, cr)) != 0) {
			ip->i_flag |= ISTALE;
			ip->i_ufs_acl = NULL;
			rw_exit(&ip->i_contents);
			vp->v_vfsp = &EIO_vfs;
			VN_RELE(vp);
			return (error);
		}
	}

	/*
	 * Only attach quota information if the inode has a type and if
	 * that type is not a shadow inode.
	 */
	if (ip->i_mode && ((ip->i_mode & IFMT) != IFSHAD) &&
	    ((ip->i_mode & IFMT) != IFATTRDIR)) {
		ip->i_dquot = getinoquota(ip);
	}
	TRANS_MATA_IGET(ufsvfsp, ip);
	*ipp = ip;
	rw_exit(&ip->i_contents);

	return (0);
}
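
/*
 * A note on the error paths above (a reading of the code, not new
 * behavior): once ISTALE is set, v_vfsp is redirected to EIO_vfs before
 * the VN_RELE(), so a straggler still holding the vnode dispatches its
 * VOPs through the EIO vfs rather than a filesystem the inode no longer
 * belongs to.
 */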

/*
 * Vnode is no longer referenced, write the inode out
 * and if necessary, truncate and deallocate the file.
 */
void
ufs_iinactive(struct inode *ip)
{
	int front;
	struct inode *iq;
	struct inode *hip;
	struct ufs_q *uq;
	struct vnode *vp = ITOV(ip);
	struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
	struct ufs_delq_info *delq_info = &ufsvfsp->vfs_delete_info;

	/*
	 * Because the vnode type might have been changed,
	 * the dnlc_dir_purge must be called unconditionally.
	 */
	dnlc_dir_purge(&ip->i_danchor);

	/*
	 * Get exclusive access to inode data.
	 */
	rw_enter(&ip->i_contents, RW_WRITER);
	ASSERT(ip->i_flag & IREF);

	/*
	 * Make sure no one reclaimed the inode before we put it on
	 * the freelist or destroy it.  We keep our 'hold' on the vnode
	 * from vn_rele until we are ready to do something with the inode.
	 *
	 * Pageout may put a VN_HOLD/VN_RELE at anytime during this
	 * operation via an async putpage, so we must make sure
	 * we don't free/destroy the inode more than once.  ufs_iget
	 * may also put a VN_HOLD on the inode before it grabs
	 * the i_contents lock.  This is done so we don't free
	 * an inode that a thread is waiting on.
	 */
	mutex_enter(&vp->v_lock);

	if (vp->v_count > 1) {
		VN_RELE_LOCKED(vp);
		mutex_exit(&vp->v_lock);
		rw_exit(&ip->i_contents);
		return;
	}
	mutex_exit(&vp->v_lock);

	/*
	 * For umount case: if ufsvfs ptr is NULL, the inode is unhashed
	 * and clean.  It can be safely destroyed (cyf).
	 */
	if (ip->i_ufsvfs == NULL) {
		rw_exit(&ip->i_contents);
		ufs_si_del(ip);
		ASSERT((vp->v_type == VCHR) || !vn_has_cached_data(vp));
		ufs_free_inode(ip);
		return;
	}

	/*
	 * queue idle inode to appropriate thread. Will check v_count == 1
	 * prior to putting this on the appropriate queue.
	 * Stale inodes will be unhashed and freed by the ufs idle thread
	 * in ufs_idle_free()
	 */
	front = 1;
	if ((ip->i_flag & ISTALE) == 0 && ip->i_fs->fs_ronly == 0 &&
	    ip->i_mode && ip->i_nlink <= 0) {
		/*
		 * Mark the i_flag to indicate that inode is being deleted.
		 * This flag will be cleared when the deletion is complete.
		 * This prevents nfs from sneaking in via ufs_vget() while
		 * the delete is in progress (bugid 1242481).
		 */
		ip->i_flag |= IDEL;

		/*
		 * NOIDEL means that deletes are not allowed at this time;
		 * whoever resets NOIDEL will also send this inode back
		 * through ufs_iinactive.  IREF remains set.
		 */
		if (ULOCKFS_IS_NOIDEL(ITOUL(ip))) {
			mutex_enter(&vp->v_lock);
			VN_RELE_LOCKED(vp);
			mutex_exit(&vp->v_lock);
			rw_exit(&ip->i_contents);
			return;
		}
		if (!TRANS_ISTRANS(ip->i_ufsvfs)) {
			rw_exit(&ip->i_contents);
			ufs_delete(ip->i_ufsvfs, ip, 0);
			return;
		}

		/* queue to delete thread; IREF remains set */
		ins.in_qfree.value.ul++;
		uq = &ip->i_ufsvfs->vfs_delete;

		mutex_enter(&uq->uq_mutex);

		/* add to q */
		if ((iq = uq->uq_ihead) != 0) {
			ip->i_freef = iq;
			ip->i_freeb = iq->i_freeb;
			iq->i_freeb->i_freef = ip;
			iq->i_freeb = ip;
			if (front)
				uq->uq_ihead = ip;
		} else {
			uq->uq_ihead = ip;
			ip->i_freef = ip;
			ip->i_freeb = ip;
		}

		delq_info->delq_unreclaimed_files += 1;
		delq_info->delq_unreclaimed_blocks += ip->i_blocks;
	} else {
		/*
		 * Queue to idle thread; check the v_count == 1 again.
		 */
		mutex_enter(&vp->v_lock);
		if (vp->v_count > 1) {
			VN_RELE_LOCKED(vp);
			mutex_exit(&vp->v_lock);
			rw_exit(&ip->i_contents);
			return;
		}
		mutex_exit(&vp->v_lock);
		uq = &ufs_idle_q;

		/*
		 * useful iff it has pages or is a fastsymlink; otherwise junk
		 */
		mutex_enter(&uq->uq_mutex);

		/* clear IREF means `on idle list' */
		ip->i_flag &= ~(IREF | IDIRECTIO);

		if (vn_has_cached_data(vp) || ip->i_flag & IFASTSYMLNK) {
			ins.in_frback.value.ul++;
			hip = (inode_t *)&ufs_useful_iq[IQHASH(ip)];
			ufs_nuseful_iq++;
		} else {
			ins.in_frfront.value.ul++;
			hip = (inode_t *)&ufs_junk_iq[IQHASH(ip)];
			ip->i_flag |= IJUNKIQ;
			ufs_njunk_iq++;
		}
		ip->i_freef = hip;
		ip->i_freeb = hip->i_freeb;
		hip->i_freeb->i_freef = ip;
		hip->i_freeb = ip;
	}

	/* wakeup thread(s) if q is overfull */
	if (++uq->uq_ne == uq->uq_lowat)
		cv_broadcast(&uq->uq_cv);

	/* all done, release the q and inode */
	mutex_exit(&uq->uq_mutex);
	rw_exit(&ip->i_contents);
}

/*
 * Check accessed and update flags on an inode structure.
 * If any are on, update the inode with the (unique) current time.
 * If waitfor is given, ensure I/O ordering by waiting for the write
 * to complete.
 */
void
ufs_iupdat(struct inode *ip, int waitfor)
{
	struct buf *bp;
	struct fs *fp;
	struct dinode *dp;
	struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
	int i;
	int do_trans_times;
	ushort_t flag;
	o_uid_t suid;
	o_gid_t sgid;

	/*
	 * This function is now safe to be called with either the reader
	 * or writer i_contents lock.
	 */
	ASSERT(RW_LOCK_HELD(&ip->i_contents));

	/*
	 * Return if file system has been forcibly umounted.
	 */
	if (ufsvfsp == NULL)
		return;

	flag = ip->i_flag;	/* Atomic read */
	/*
	 * We better not update the disk inode from a stale inode.
	 */
	if (flag & ISTALE)
		return;

	fp = ip->i_fs;

	if ((flag & (IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG)) != 0) {
		if (fp->fs_ronly) {
			mutex_enter(&ip->i_tlock);
			ip->i_flag &= ~(IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG);
			mutex_exit(&ip->i_tlock);
			return;
		}
		/*
		 * fs is active while metadata is being written
		 */
		mutex_enter(&ufsvfsp->vfs_lock);
		ufs_notclean(ufsvfsp);
		/*
		 * get the dinode
		 */
		bp = UFS_BREAD(ufsvfsp, ip->i_dev,
		    (daddr_t)fsbtodb(fp, itod(fp, ip->i_number)),
		    (int)fp->fs_bsize);
		if (bp->b_flags & B_ERROR) {
			mutex_enter(&ip->i_tlock);
			ip->i_flag &=
			    ~(IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG);
			mutex_exit(&ip->i_tlock);
			brelse(bp);
			return;
		}
		/*
		 * munge inode fields
		 */
		mutex_enter(&ip->i_tlock);
		ITIMES_NOLOCK(ip);
		do_trans_times = ((ip->i_flag & (IMOD|IMODACC)) == IMODACC);
		ip->i_flag &= ~(IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG);
		mutex_exit(&ip->i_tlock);

		/*
		 * For reads and concurrent re-writes, no deltas were
		 * entered for the access time changes - do it now.
		 */
		if (do_trans_times) {
			TRANS_INODE_TIMES(ufsvfsp, ip);
		}

		/*
		 * For SunOS 5.0->5.4, these lines below read:
		 *
		 * suid = (ip->i_uid > MAXUID) ? UID_LONG : ip->i_uid;
		 * sgid = (ip->i_gid > MAXUID) ? GID_LONG : ip->i_gid;
		 *
		 * where MAXUID was set to 60002.  This was incorrect -
		 * the uids should have been constrained to what fitted into
		 * a 16-bit word.
		 *
		 * This means that files from 4.x filesystems that have an
		 * i_suid field larger than 60002 will have that field
		 * changed to 65535.
		 *
		 * Security note: 4.x UFS could never create a i_suid of
		 * UID_LONG since that would've corresponded to -1.
		 */
		suid = (ulong_t)ip->i_uid > (ulong_t)USHRT_MAX ?
		    UID_LONG : ip->i_uid;
		sgid = (ulong_t)ip->i_gid > (ulong_t)USHRT_MAX ?
		    GID_LONG : ip->i_gid;

		if ((ip->i_suid != suid) || (ip->i_sgid != sgid)) {
			ip->i_suid = suid;
			ip->i_sgid = sgid;
			TRANS_INODE(ufsvfsp, ip);
		}

		if ((ip->i_mode & IFMT) == IFBLK ||
		    (ip->i_mode & IFMT) == IFCHR) {
			dev_t d = ip->i_rdev;
			dev32_t dev32;

			/*
			 * load first direct block only if special device
			 */
			if (!cmpldev(&dev32, d)) {
				/*
				 * We panic here because there's "no way"
				 * we should have been able to create a large
				 * inode with a large dev_t.  Earlier layers
				 * should've caught this.
				 */
				panic("ip %p: i_rdev too big", (void *)ip);
			}

			if (dev32 & ~((O_MAXMAJ << L_BITSMINOR32) | O_MAXMIN)) {
				ip->i_ordev = dev32;	/* can't use old fmt. */
			} else {
				ip->i_ordev = cmpdev(d);
			}
		}

		/*
		 * copy inode to dinode (zero fastsymlnk in dinode)
		 */
		dp = (struct dinode *)bp->b_un.b_addr + itoo(fp, ip->i_number);
		dp->di_ic = ip->i_ic;	/* structure assignment */
		if (flag & IFASTSYMLNK) {
			for (i = 1; i < NDADDR; i++)
				dp->di_db[i] = 0;
			for (i = 0; i < NIADDR; i++)
				dp->di_ib[i] = 0;
		}
		if (TRANS_ISTRANS(ufsvfsp)) {
			/*
			 * Pass only a sector size buffer containing
			 * the inode, otherwise when the buffer is copied
			 * into a cached roll buffer then too much memory
			 * gets consumed if 8KB inode buffers are passed.
			 */
			TRANS_LOG(ufsvfsp, (caddr_t)dp, ip->i_doff,
			    sizeof (struct dinode),
			    (caddr_t)P2ALIGN((uintptr_t)dp, DEV_BSIZE),
			    DEV_BSIZE);

			brelse(bp);
		} else if (waitfor && (ip->i_ufsvfs->vfs_dio == 0)) {
			UFS_BRWRITE(ufsvfsp, bp);

			/*
			 * Synchronous write has guaranteed that inode
			 * has been written on disk so clear the flag
			 */
			mutex_enter(&ip->i_tlock);
			ip->i_flag &= ~IBDWRITE;
			mutex_exit(&ip->i_tlock);
		} else {
			bdrwrite(bp);

			/*
			 * This write hasn't guaranteed that inode has been
			 * written on the disk.
			 * Since, all updat flags on inode are cleared, we must
			 * remember the condition in case inode is to be
			 * updated synchronously later (e.g.-
			 * fsync()/fdatasync()) and inode has not been
			 * modified yet.
			 */
			mutex_enter(&ip->i_tlock);
			ip->i_flag |= IBDWRITE;
			mutex_exit(&ip->i_tlock);
		}
	} else {
		/*
		 * In case previous inode update was done asynchronously
		 * (IBDWRITE) and this inode update request wants guaranteed
		 * (synchronous) disk update, flush the inode.
		 */
		if (waitfor && (flag & IBDWRITE)) {
			blkflush(ip->i_dev,
			    (daddr_t)fsbtodb(fp, itod(fp, ip->i_number)));
			mutex_enter(&ip->i_tlock);
			ip->i_flag &= ~IBDWRITE;
			mutex_exit(&ip->i_tlock);
		}
	}
}

#define	SINGLE	0	/* index of single indirect block */
#define	DOUBLE	1	/* index of double indirect block */
#define	TRIPLE	2	/* index of triple indirect block */
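
/*
 * Capacity sketch (illustrative numbers): with an 8 KB fs_bsize and
 * 4-byte disk addresses, NINDIR(fs) == 8192 / 4 == 2048, so a single
 * indirect block maps 2048 data blocks, a double indirect block maps
 * 2048^2, and a triple indirect block 2048^3.
 */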

/*
 * Release blocks associated with the inode ip and
 * stored in the indirect block bn.  Blocks are free'd
 * in LIFO order up to (but not including) lastbn.  If
 * level is greater than SINGLE, the block is an indirect
 * block and recursive calls to indirtrunc must be used to
 * cleanse other indirect blocks.
 *
 * N.B.: triple indirect blocks are untested.
 */
static long
indirtrunc(struct inode *ip, daddr_t bn, daddr_t lastbn, int level, int flags)
{
	int i;
	struct buf *bp, *copy;
	daddr32_t *bap;
	struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
	struct fs *fs = ufsvfsp->vfs_fs;
	daddr_t nb, last;
	long factor;
	int blocksreleased = 0, nblocks;

	ASSERT(RW_WRITE_HELD(&ip->i_contents));
	/*
	 * Calculate index in current block of last
	 * block to be kept.  -1 indicates the entire
	 * block so we need not calculate the index.
	 */
	factor = 1;
	for (i = SINGLE; i < level; i++)
		factor *= NINDIR(fs);
	last = lastbn;
	if (lastbn > 0)
		last /= factor;
	nblocks = btodb(fs->fs_bsize);
	/*
	 * Get buffer of block pointers, zero those
	 * entries corresponding to blocks to be free'd,
	 * and update on disk copy first.
	 * *Unless* the root pointer has been synchronously
	 * written to disk.  If nothing points to this
	 * indirect block then don't bother zero'ing and
	 * writing it.
	 */
	bp = UFS_BREAD(ufsvfsp,
	    ip->i_dev, (daddr_t)fsbtodb(fs, bn), (int)fs->fs_bsize);
	if (bp->b_flags & B_ERROR) {
		brelse(bp);
		return (0);
	}
	bap = bp->b_un.b_daddr;
	if ((flags & I_CHEAP) == 0) {
		uint_t zb;

		zb = (uint_t)((NINDIR(fs) - (last + 1)) * sizeof (daddr32_t));

		if (zb) {
			/*
			 * push any data into the log before we zero it
			 */
			if (bp->b_flags & B_DELWRI)
				TRANS_LOG(ufsvfsp, (caddr_t)bap,
				    ldbtob(bp->b_blkno), bp->b_bcount,
				    bp->b_un.b_addr, bp->b_bcount);
			copy = ngeteblk(fs->fs_bsize);
			bcopy((caddr_t)bap, (caddr_t)copy->b_un.b_daddr,
			    (uint_t)fs->fs_bsize);
			bzero((caddr_t)&bap[last + 1], zb);

			TRANS_BUF(ufsvfsp,
			    (caddr_t)&bap[last + 1] - (caddr_t)bap,
			    zb, bp, DT_ABZERO);

			UFS_BRWRITE(ufsvfsp, bp);
			bp = copy, bap = bp->b_un.b_daddr;
		}
	} else {
		/* make sure write retries are also cleared */
		bp->b_flags &= ~(B_DELWRI | B_RETRYWRI);
		bp->b_flags |= B_STALE | B_AGE;
	}

	/*
	 * Recursively free totally unused blocks.
	 */
	flags |= I_CHEAP;
	for (i = NINDIR(fs) - 1; i > last; i--) {
		nb = bap[i];
		if (nb == 0)
			continue;
		if (level > SINGLE) {
			blocksreleased +=
			    indirtrunc(ip, nb, (daddr_t)-1, level - 1, flags);
			free(ip, nb, (off_t)fs->fs_bsize, flags | I_IBLK);
		} else
			free(ip, nb, (off_t)fs->fs_bsize, flags);
		blocksreleased += nblocks;
	}
	flags &= ~I_CHEAP;

	/*
	 * Recursively free last partial block.
	 */
	if (level > SINGLE && lastbn >= 0) {
		last = lastbn % factor;
		nb = bap[i];
		if (nb != 0)
			blocksreleased +=
			    indirtrunc(ip, nb, last, level - 1, flags);
	}
	brelse(bp);
	return (blocksreleased);
}
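
/*
 * On the factor/last arithmetic in indirtrunc() (a worked example): for
 * a level == DOUBLE call, factor == NINDIR(fs), so last = lastbn / factor
 * selects the single-indirect subtree that still holds live blocks;
 * entries past it are freed outright, and that boundary subtree recurses
 * with lastbn % factor.
 */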

/*
 * Truncate the inode ip to at most length size.
 * Free affected disk blocks -- the blocks of the
 * file are removed in reverse order.
 *
 * N.B.: triple indirect blocks are untested.
 */
static int i_genrand = 1234;
int
ufs_itrunc(struct inode *oip, u_offset_t length, int flags, cred_t *cr)
{
	struct fs *fs = oip->i_fs;
	struct ufsvfs *ufsvfsp = oip->i_ufsvfs;
	struct inode *ip;
	daddr_t lastblock;
	off_t bsize;
	int boff;
	daddr_t bn, lastiblock[NIADDR];
	int level;
	long nblocks, blocksreleased = 0;
	int i;
	ushort_t mode;
	struct inode tip;
	int err;
	u_offset_t maxoffset = (ufsvfsp->vfs_lfflags & UFS_LARGEFILES) ?
	    (UFS_MAXOFFSET_T) : (MAXOFF32_T);

	/*
	 * Shadow inodes do not need to hold the vfs_dqrwlock lock.  Most
	 * other uses need the reader lock.  opendq() holds the writer lock.
	 */
	ASSERT((oip->i_mode & IFMT) == IFSHAD ||
	    RW_LOCK_HELD(&ufsvfsp->vfs_dqrwlock));
	ASSERT(RW_WRITE_HELD(&oip->i_contents));
	/*
	 * We only allow truncation of regular files and directories
	 * to arbitrary lengths here.  In addition, we allow symbolic
	 * links to be truncated only to zero length.  Other inode
	 * types cannot have their length set here.  Disk blocks are
	 * being dealt with - especially device inodes where
	 * ip->i_ordev is actually being stored in ip->i_db[0]!
	 */
	TRANS_INODE(ufsvfsp, oip);
	mode = oip->i_mode & IFMT;
	if (flags & I_FREE) {
		i_genrand *= 16843009;	/* turns into shift and adds */
		i_genrand++;
		oip->i_gen += ((i_genrand + ddi_get_lbolt()) & 0xffff) + 1;
		oip->i_flag |= ICHG|IUPD;
		oip->i_seq++;
		if (length == oip->i_size)
			return (0);
		flags |= I_CHEAP;
	}
	if (mode == IFIFO)
		return (0);
	if (mode != IFREG && mode != IFDIR && mode != IFATTRDIR &&
	    !(mode == IFLNK && length == (offset_t)0) && mode != IFSHAD)
		return (EINVAL);
	if (length > maxoffset)
		return (EFBIG);
	if ((mode == IFDIR) || (mode == IFATTRDIR))
		flags |= I_DIR;
	if (mode == IFSHAD)
		flags |= I_SHAD;
	if (oip == ufsvfsp->vfs_qinod)
		flags |= I_QUOTA;
	if (length == oip->i_size) {
		/* update ctime and mtime to please POSIX tests */
		oip->i_flag |= ICHG|IUPD;
		oip->i_seq++;
		if (length == 0) {
			/* nothing to cache so clear the flag */
			oip->i_flag &= ~IFASTSYMLNK;
		}
		return (0);
	}
	/* wipe out fast symlink till next access */
	if (oip->i_flag & IFASTSYMLNK) {
		int j;

		ASSERT(ITOV(oip)->v_type == VLNK);

		oip->i_flag &= ~IFASTSYMLNK;

		for (j = 1; j < NDADDR; j++)
			oip->i_db[j] = 0;
		for (j = 0; j < NIADDR; j++)
			oip->i_ib[j] = 0;
	}

	boff = (int)blkoff(fs, length);

	if (length > oip->i_size) {
		/*
		 * Trunc up case.  BMAPALLOC will ensure that the right blocks
		 * are allocated.  This includes extending the old frag to a
		 * full block (if needed) in addition to doing any work
		 * needed for allocating the last block.
		 */
		if (boff == 0)
			err = BMAPALLOC(oip, length - 1, (int)fs->fs_bsize,
			    cr);
		else
			err = BMAPALLOC(oip, length - 1, boff, cr);

		if (err == 0) {
			/*
			 * Save old size and set inode's size now
			 * so that we don't cause too much of the
			 * file to be zero'd and pushed.
			 */
			u_offset_t osize = oip->i_size;
			oip->i_size = length;
			/*
			 * Make sure we zero out the remaining bytes of
			 * the page in case a mmap scribbled on it.  We
			 * can't prevent a mmap from writing beyond EOF
			 * on the last page of a file.
			 */
			if ((boff = (int)blkoff(fs, osize)) != 0) {
				bsize = (int)lblkno(fs, osize - 1) >= NDADDR ?
				    fs->fs_bsize : fragroundup(fs, boff);
				pvn_vpzero(ITOV(oip), osize,
				    (size_t)(bsize - boff));
			}
			oip->i_flag |= ICHG|IATTCHG;
			oip->i_seq++;
			ITIMES_NOLOCK(oip);
			/*
			 * MAXOFF32_T is old 2GB size limit.  If
			 * this operation caused a large file to be
			 * created, turn on the superblock flag
			 * and update the superblock, if the flag
			 * is not already on.
			 */
			if ((length > (u_offset_t)MAXOFF32_T) &&
			    !(fs->fs_flags & FSLARGEFILES)) {
				ASSERT(ufsvfsp->vfs_lfflags & UFS_LARGEFILES);
				mutex_enter(&ufsvfsp->vfs_lock);
				fs->fs_flags |= FSLARGEFILES;
				ufs_sbwrite(ufsvfsp);
				mutex_exit(&ufsvfsp->vfs_lock);
			}
		}

		return (err);
	}

	/*
	 * Update the pages of the file.  If the file is not being
	 * truncated to a block boundary, the contents of the
	 * pages following the end of the file must be zero'ed
	 * in case it ever becomes accessible again because
	 * of subsequent file growth.
	 */
	if (boff == 0) {
		(void) pvn_vplist_dirty(ITOV(oip), length, ufs_putapage,
		    B_INVAL | B_TRUNC, CRED());
	} else {
		/*
		 * Make sure that the last block is properly allocated.
		 * We only really have to do this if the last block is
		 * actually allocated since ufs_bmap will now handle the case
		 * of a fragment which has no block allocated.  Just to
		 * be sure, we do it now independent of current allocation.
		 */
		err = BMAPALLOC(oip, length - 1, boff, cr);
		if (err)
			return (err);

		/*
		 * BMAPALLOC will call bmap_write which defers i_seq
		 * processing.  If the timestamps were changed, update
		 * i_seq before rdip drops i_contents or syncs the inode.
		 */
		if (oip->i_flag & (ICHG|IUPD))
			oip->i_seq++;

		/*
		 * BugId 4069932
		 * Make sure that the relevant partial page appears in
		 * the v_pages list, so that pvn_vpzero() will do its
		 * job.  Since doing this correctly requires everything
		 * in rdip() except for the uiomove(), it's easier and
		 * safer to do the uiomove() rather than duplicate the
		 * rest of rdip() here.
		 *
		 * To get here, we know that length indicates a byte
		 * that is not the first byte of a block.  (length - 1)
		 * is the last actual byte known to exist.  Deduction
		 * shows it is in the same block as byte (length).
		 * Thus, this rdip() invocation should always succeed
		 * except in the face of i/o errors, and give us the
		 * block we care about.
		 *
		 * rdip() makes the same locking assertions and
		 * assumptions as we do.  We do not acquire any locks
		 * before calling it, so we have not changed the locking
		 * situation.  Finally, there do not appear to be any
		 * paths whereby rdip() ends up invoking us again.
		 * Thus, infinite recursion is avoided.
		 */
		{
			uio_t uio;
			iovec_t iov[1];
			char buffer;

			uio.uio_iov = iov;
			uio.uio_iovcnt = 1;
			uio.uio_loffset = length - 1;
			uio.uio_resid = 1;
			uio.uio_segflg = UIO_SYSSPACE;
			uio.uio_extflg = UIO_COPY_CACHED;

			iov[0].iov_base = &buffer;
			iov[0].iov_len = 1;

			err = rdip(oip, &uio, UIO_READ, NULL);
			if (err)
				return (err);
		}

		bsize = (int)lblkno(fs, length - 1) >= NDADDR ?
		    fs->fs_bsize : fragroundup(fs, boff);
		pvn_vpzero(ITOV(oip), length, (size_t)(bsize - boff));
		/*
		 * Ensure full fs block is marked as dirty.
		 */
		(void) pvn_vplist_dirty(ITOV(oip), length + (bsize - boff),
		    ufs_putapage, B_INVAL | B_TRUNC, CRED());
	}

	/*
	 * Calculate index into inode's block list of
	 * last direct and indirect blocks (if any)
	 * which we want to keep.  Lastblock is -1 when
	 * the file is truncated to 0.
	 */
	lastblock = lblkno(fs, length + fs->fs_bsize - 1) - 1;
	lastiblock[SINGLE] = lastblock - NDADDR;
	lastiblock[DOUBLE] = lastiblock[SINGLE] - NINDIR(fs);
	lastiblock[TRIPLE] = lastiblock[DOUBLE] - NINDIR(fs) * NINDIR(fs);
	nblocks = btodb(fs->fs_bsize);

	/*
	 * Update file and block pointers
	 * on disk before we start freeing blocks.
	 * If we crash before free'ing blocks below,
	 * the blocks will be returned to the free list.
	 * lastiblock values are also normalized to -1
	 * for calls to indirtrunc below.
	 */
	tip = *oip;			/* structure copy */
	ip = &tip;

	for (level = TRIPLE; level >= SINGLE; level--)
		if (lastiblock[level] < 0) {
			oip->i_ib[level] = 0;
			lastiblock[level] = -1;
		}
	for (i = NDADDR - 1; i > lastblock; i--) {
		oip->i_db[i] = 0;
		flags |= I_CHEAP;
	}
	oip->i_size = length;
	oip->i_flag |= ICHG|IUPD|IATTCHG;
	oip->i_seq++;
	if (!TRANS_ISTRANS(ufsvfsp))
		ufs_iupdat(oip, I_SYNC);	/* do sync inode update */

	/*
	 * Indirect blocks first.
	 */
	for (level = TRIPLE; level >= SINGLE; level--) {
		bn = ip->i_ib[level];
		if (bn != 0) {
			blocksreleased +=
			    indirtrunc(ip, bn, lastiblock[level], level,
			    flags);
			if (lastiblock[level] < 0) {
				ip->i_ib[level] = 0;
				free(ip, bn, (off_t)fs->fs_bsize,
				    flags | I_IBLK);
				blocksreleased += nblocks;
			}
		}
		if (lastiblock[level] >= 0)
			goto done;
	}

	/*
	 * All whole direct blocks or frags.
	 */
	for (i = NDADDR - 1; i > lastblock; i--) {
		bn = ip->i_db[i];
		if (bn == 0)
			continue;
		ip->i_db[i] = 0;
		bsize = (off_t)blksize(fs, ip, i);
		free(ip, bn, bsize, flags);
		blocksreleased += btodb(bsize);
	}
	if (lastblock < 0)
		goto done;

	/*
	 * Finally, look for a change in size of the
	 * last direct block; release any frags.
	 */
	bn = ip->i_db[lastblock];
	if (bn != 0) {
		off_t oldspace, newspace;

		/*
		 * Calculate amount of space we're giving
		 * back as old block size minus new block size.
		 */
		oldspace = blksize(fs, ip, lastblock);
		UFS_SET_ISIZE(length, ip);
		newspace = blksize(fs, ip, lastblock);
		if (newspace == 0) {
			err = ufs_fault(ITOV(ip), "ufs_itrunc: newspace == 0");
			return (err);
		}
		if (oldspace - newspace > 0) {
			/*
			 * Block number of space to be free'd is
			 * the old block # plus the number of frags
			 * required for the storage we're keeping.
			 */
			bn += numfrags(fs, newspace);
			free(ip, bn, oldspace - newspace, flags);
			blocksreleased += btodb(oldspace - newspace);
		}
	}
done:
	/* BEGIN PARANOIA */
	for (level = SINGLE; level <= TRIPLE; level++)
		if (ip->i_ib[level] != oip->i_ib[level]) {
			err = ufs_fault(ITOV(ip), "ufs_itrunc: indirect block");
			return (err);
		}

	for (i = 0; i < NDADDR; i++)
		if (ip->i_db[i] != oip->i_db[i]) {
			err = ufs_fault(ITOV(ip), "ufs_itrunc: direct block");
			return (err);
		}
	/* END PARANOIA */
	oip->i_blocks -= blocksreleased;

	if (oip->i_blocks < 0) {		/* sanity */
		cmn_err(CE_NOTE,
		    "ufs_itrunc: %s/%d new size = %lld, blocks = %d\n",
		    fs->fs_fsmnt, (int)oip->i_number, oip->i_size,
		    (int)oip->i_blocks);
		oip->i_blocks = 0;
	}
	oip->i_flag |= ICHG|IATTCHG;
	oip->i_seq++;
	/* blocksreleased is >= zero, so this can not fail */
	(void) chkdq(oip, -blocksreleased, 0, cr, (char **)NULL,
	    (size_t *)NULL);
	return (0);
}

/*
 * Check mode permission on inode.  Mode is READ, WRITE or EXEC.
 * In the case of WRITE, the read-only status of the file system
 * is checked.  Depending on the calling user, the appropriate
 * mode bits are selected; privileges to override missing permission
 * bits are checked through secpolicy_vnode_access().
 * The i_contents lock must be held as reader here to prevent racing with
 * the acl subsystem removing/setting/changing acls on this inode.
 * The caller is responsible for indicating whether or not the i_contents
 * lock needs to be acquired here or if already held.
 */
int
ufs_iaccess(struct inode *ip, int mode, struct cred *cr, int dolock)
{
	int shift = 0;
	int ret = 0;

	if (dolock)
		rw_enter(&ip->i_contents, RW_READER);
	ASSERT(RW_LOCK_HELD(&ip->i_contents));

	if (mode & IWRITE) {
		/*
		 * Disallow write attempts on read-only
		 * file systems, unless the file is a block
		 * or character device or a FIFO.
		 */
		if (ip->i_fs->fs_ronly != 0) {
			if ((ip->i_mode & IFMT) != IFCHR &&
			    (ip->i_mode & IFMT) != IFBLK &&
			    (ip->i_mode & IFMT) != IFIFO) {
				ret = EROFS;
				goto out;
			}
		}
	}
	/*
	 * If there is an acl, check the acl and return.
	 */
	if (ip->i_ufs_acl && ip->i_ufs_acl->aowner) {
		ret = ufs_acl_access(ip, mode, cr);
		goto out;
	}

	/*
	 * Access check is based on only one of owner, group, public.
	 * If not owner, then check group.
	 * If not a member of the group, then check public access.
	 */
	if (crgetuid(cr) != ip->i_uid) {
		shift += 3;
		if (!groupmember((uid_t)ip->i_gid, cr))
			shift += 3;
	}

	/* test missing privilege bits */
	ret = secpolicy_vnode_access2(cr, ITOV(ip), ip->i_uid,
	    ip->i_mode << shift, mode);
out:
	if (dolock)
		rw_exit(&ip->i_contents);
	return (ret);
}
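
/*
 * Example of the shift trick above (illustrative): for mode 0640 and a
 * caller who is not the owner but is in the file's group, shift == 3,
 * so i_mode << 3 moves the group bits (0040) into the owner position
 * (0400), where secpolicy_vnode_access2() tests them against the
 * requested IREAD/IWRITE/IEXEC bits.
 */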

/*
 * if necessary, remove an inode from the free list
 * i_contents is held except at unmount
 *
 * Return 1 if the inode is taken off of the ufs_idle_q,
 * and the caller is expected to call VN_RELE.
 *
 * Return 0 otherwise.
 */
int
ufs_rmidle(struct inode *ip)
{
	int rval = 0;

	mutex_enter(&ip->i_tlock);
	if ((ip->i_flag & IREF) == 0) {
		mutex_enter(&ufs_idle_q.uq_mutex);
		ip->i_freef->i_freeb = ip->i_freeb;
		ip->i_freeb->i_freef = ip->i_freef;
		ip->i_freef = ip;
		ip->i_freeb = ip;
		ip->i_flag |= IREF;
		ufs_idle_q.uq_ne--;
		if (ip->i_flag & IJUNKIQ) {
			ufs_njunk_iq--;
			ip->i_flag &= ~IJUNKIQ;
		} else {
			ufs_nuseful_iq--;
		}
		mutex_exit(&ufs_idle_q.uq_mutex);
		rval = 1;
	}
	mutex_exit(&ip->i_tlock);
	return (rval);
}
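
/*
 * Sketch of a ufs_scan_inodes() callback (hypothetical helper, shown only
 * to illustrate the contract; the inode is VN_HELD across the call and
 * the last nonzero return value is what the scan hands back):
 *
 *	static int
 *	example_cb(struct inode *ip, void *arg)
 *	{
 *		... inspect or flush ip ...
 *		return (0);
 *	}
 *
 *	err = ufs_scan_inodes(1, example_cb, NULL, ufsvfsp);
 */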

/*
 * scan the hash of inodes and call func with the inode locked
 */
int
ufs_scan_inodes(int rwtry, int (*func)(struct inode *, void *), void *arg,
    struct ufsvfs *ufsvfsp)
{
	struct inode *ip;		/* current inode */
	struct inode *lip = NULL;	/* last/previous inode */
	union ihead *ih;		/* current hash chain */
	int error, i;
	int saverror = 0;
	int lip_held;			/* lip needs a VN_RELE() */

	/*
	 * If ufsvfsp is NULL, then our caller should be holding
	 * ufs_scan_lock to avoid conflicts between ufs_unmount() and
	 * ufs_update().  Otherwise, to avoid false-positives in
	 * ufs_unmount()'s v_count-based EBUSY check, we only hold
	 * those inodes that are in the file system our caller cares
	 * about.
	 *
	 * We know that ip is a valid inode in the hash chain (and thus
	 * we can trust i_ufsvfs) because the inode we chained from
	 * (lip) is still in the hash chain.  This is true because either:
	 *
	 * 1. We did not drop the hash chain lock since the last
	 *    iteration (because we were not interested in the last inode),
	 * or
	 * 2. We maintained a hold on the last inode while we
	 *    were processing it, so it could not be removed
	 *    from the hash chain.
	 *
	 * The whole reason we're dropping and re-grabbing the chain
	 * lock on every inode is so that we don't present a major
	 * choke point on throughput, particularly when we've been
	 * called on behalf of fsflush.
	 */

	for (i = 0, ih = ihead; i < inohsz; i++, ih++) {
		mutex_enter(&ih_lock[i]);
		for (ip = ih->ih_chain[0], lip_held = 0;
		    ip != (struct inode *)ih;
		    ip = lip->i_forw) {

			ins.in_scan.value.ul++;

			/*
			 * Undo the previous iteration's VN_HOLD(), but
			 * only if one was done.
			 */
			if (lip_held)
				VN_RELE(ITOV(lip));

			lip = ip;
			if (ufsvfsp != NULL && ip->i_ufsvfs != ufsvfsp) {
				/*
				 * We're not processing all inodes, and
				 * this inode is not in the filesystem of
				 * interest, so skip it.  No need to do a
				 * VN_HOLD() since we're not dropping the
				 * hash chain lock until after we've
				 * done the i_forw traversal above.
				 */
				lip_held = 0;
				continue;
			}
			VN_HOLD(ITOV(ip));
			lip_held = 1;
			mutex_exit(&ih_lock[i]);

			/*
			 * Acquire the contents lock as writer to make
			 * sure that the inode has been initialized in
			 * the cache or removed from the idle list by
			 * ufs_iget().  This works because ufs_iget()
			 * acquires the contents lock before putting
			 * the inode into the cache.  If we can lock
			 * it, then ufs_iget() is done with it.
			 */

			if (rwtry) {
				if (!rw_tryenter(&ip->i_contents, RW_WRITER)) {
					mutex_enter(&ih_lock[i]);
					continue;
				}
			} else {
				rw_enter(&ip->i_contents, RW_WRITER);
			}

			rw_exit(&ip->i_contents);

			/*
			 * ISTALE means the inode couldn't be read
			 *
			 * We don't have to hold the i_contents lock
			 * for this check for a couple of
			 * reasons.  First, if ISTALE is set then the
			 * flag cannot be cleared until the inode is
			 * removed from the cache and that cannot
			 * happen until after we VN_RELE() it.
			 * Second, if ISTALE is not set, then the
			 * inode is in the cache and does not need to
			 * be read from disk so ISTALE cannot be set
			 * while we are not looking.
			 */
			if ((ip->i_flag & ISTALE) == 0) {
				if ((error = (*func)(ip, arg)) != 0)
					saverror = error;
			}

			mutex_enter(&ih_lock[i]);
		}
		if (lip_held)
			VN_RELE(ITOV(lip));
		mutex_exit(&ih_lock[i]);
	}
	return (saverror);
}

/*
 * Mark inode with the current time, plus a unique increment.
 *
 * Since we only keep 32-bit time on disk, if UFS is still alive
 * beyond 2038, filesystem times will simply stick at the last
 * possible second of 32-bit time.  Not ideal, but probably better
 * than going into the remote past, or confusing applications with
 * negative time.
 */
void
ufs_imark(struct inode *ip)
{
	timestruc_t now;
	int32_t usec, nsec;

	/*
	 * The update of i_seq may have been deferred, increase i_seq here
	 * to make sure it is in sync with the timestamps.
	 */
	if (ip->i_flag & ISEQ) {
		ASSERT(ip->i_flag & (IUPD|ICHG));
		ip->i_seq++;
		ip->i_flag &= ~ISEQ;
	}

	gethrestime(&now);

	/*
	 * Fast algorithm to convert nsec to usec -- see hrt2ts()
	 * in common/os/timers.c for a full description.
	 */
	nsec = now.tv_nsec;
	usec = nsec + (nsec >> 2);
	usec = nsec + (usec >> 1);
	usec = nsec + (usec >> 2);
	usec = nsec + (usec >> 4);
	usec = nsec - (usec >> 3);
	usec = nsec + (usec >> 2);
	usec = nsec + (usec >> 3);
	usec = nsec + (usec >> 4);
	usec = nsec + (usec >> 1);
	usec = nsec + (usec >> 6);
	usec = usec >> 10;

	mutex_enter(&ufs_iuniqtime_lock);
	if (now.tv_sec > (time_t)iuniqtime.tv_sec ||
	    usec > iuniqtime.tv_usec) {
		if (now.tv_sec < TIME32_MAX) {
			iuniqtime.tv_sec = (time32_t)now.tv_sec;
			iuniqtime.tv_usec = usec;
		}
	} else {
		if (iuniqtime.tv_sec < TIME32_MAX) {
			iuniqtime.tv_usec++;
			/* Check for usec overflow */
			if (iuniqtime.tv_usec >= MICROSEC) {
				iuniqtime.tv_sec++;
				iuniqtime.tv_usec = 0;
			}
		}
	}

	if ((ip->i_flag & IACC) && !(ip->i_ufsvfs->vfs_noatime)) {
		ip->i_atime = iuniqtime;
	}
	if (ip->i_flag & IUPD) {
		ip->i_mtime = iuniqtime;
		ip->i_flag |= IMODTIME;
	}
	if (ip->i_flag & ICHG) {
		ip->i_diroff = 0;
		ip->i_ctime = iuniqtime;
	}
	mutex_exit(&ufs_iuniqtime_lock);
}
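
/*
 * The shift cascade above approximates usec = nsec / 1000 without a
 * divide: it effectively scales nsec by ~1.024 before the final ">> 10",
 * so nsec == 1000000 comes out as usec of approximately 1000.  See
 * hrt2ts() in common/os/timers.c, from which the sequence is borrowed.
 */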

/*
 * Update timestamps in inode.
 */
void
ufs_itimes_nolock(struct inode *ip)
{

	/*
	 * if noatime is set and the inode access time is the only field that
	 * must be changed, exit immediately.
	 */
	if (((ip->i_flag & (IUPD|IACC|ICHG)) == IACC) &&
	    (ip->i_ufsvfs->vfs_noatime)) {
		return;
	}

	if (ip->i_flag & (IUPD|IACC|ICHG)) {
		if (ip->i_flag & ICHG)
			ip->i_flag |= IMOD;
		else
			ip->i_flag |= IMODACC;
		ufs_imark(ip);
		ip->i_flag &= ~(IACC|IUPD|ICHG);
	}
}