1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 1983, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2016, 2017 by Delphix. All rights reserved. 24 */ 25 26 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ 27 /* All Rights Reserved */ 28 29 /* 30 * University Copyright- Copyright (c) 1982, 1986, 1988 31 * The Regents of the University of California 32 * All Rights Reserved 33 * 34 * University Acknowledgment- Portions of this document are derived from 35 * software developed by the University of California, Berkeley, and its 36 * contributors. 
37 */ 38 39 #include <sys/types.h> 40 #include <sys/t_lock.h> 41 #include <sys/param.h> 42 #include <sys/systm.h> 43 #include <sys/uio.h> 44 #include <sys/bitmap.h> 45 #include <sys/signal.h> 46 #include <sys/cred.h> 47 #include <sys/user.h> 48 #include <sys/vfs.h> 49 #include <sys/stat.h> 50 #include <sys/vnode.h> 51 #include <sys/buf.h> 52 #include <sys/proc.h> 53 #include <sys/disp.h> 54 #include <sys/dnlc.h> 55 #include <sys/mode.h> 56 #include <sys/cmn_err.h> 57 #include <sys/kstat.h> 58 #include <sys/acl.h> 59 #include <sys/var.h> 60 #include <sys/fs/ufs_inode.h> 61 #include <sys/fs/ufs_fs.h> 62 #include <sys/fs/ufs_trans.h> 63 #include <sys/fs/ufs_acl.h> 64 #include <sys/fs/ufs_bio.h> 65 #include <sys/fs/ufs_quota.h> 66 #include <sys/fs/ufs_log.h> 67 #include <vm/hat.h> 68 #include <vm/as.h> 69 #include <vm/pvn.h> 70 #include <vm/seg.h> 71 #include <sys/swap.h> 72 #include <sys/cpuvar.h> 73 #include <sys/sysmacros.h> 74 #include <sys/errno.h> 75 #include <sys/kmem.h> 76 #include <sys/debug.h> 77 #include <fs/fs_subr.h> 78 #include <sys/policy.h> 79 80 struct kmem_cache *inode_cache; /* cache of free inodes */ 81 82 /* UFS Inode Cache Stats -- Not protected */ 83 struct instats ins = { 84 { "size", KSTAT_DATA_ULONG }, 85 { "maxsize", KSTAT_DATA_ULONG }, 86 { "hits", KSTAT_DATA_ULONG }, 87 { "misses", KSTAT_DATA_ULONG }, 88 { "kmem allocs", KSTAT_DATA_ULONG }, 89 { "kmem frees", KSTAT_DATA_ULONG }, 90 { "maxsize reached", KSTAT_DATA_ULONG }, 91 { "puts at frontlist", KSTAT_DATA_ULONG }, 92 { "puts at backlist", KSTAT_DATA_ULONG }, 93 { "queues to free", KSTAT_DATA_ULONG }, 94 { "scans", KSTAT_DATA_ULONG }, 95 { "thread idles", KSTAT_DATA_ULONG }, 96 { "lookup idles", KSTAT_DATA_ULONG }, 97 { "vget idles", KSTAT_DATA_ULONG }, 98 { "cache allocs", KSTAT_DATA_ULONG }, 99 { "cache frees", KSTAT_DATA_ULONG }, 100 { "pushes at close", KSTAT_DATA_ULONG } 101 }; 102 103 /* kstat data */ 104 static kstat_t *ufs_inode_kstat = NULL; 105 106 union ihead *ihead; /* inode 
LRU cache, Chris Maltby */ 107 kmutex_t *ih_lock; /* protect inode cache hash table */ 108 static int ino_hashlen = 4; /* desired average hash chain length */ 109 int inohsz; /* number of buckets in the hash table */ 110 111 kmutex_t ufs_scan_lock; /* stop racing multiple ufs_scan_inodes() */ 112 kmutex_t ufs_iuniqtime_lock; /* protect iuniqtime */ 113 kmutex_t ufsvfs_mutex; 114 struct ufsvfs *oldufsvfslist, *ufsvfslist; 115 116 /* 117 * time to wait after ufsvfsp->vfs_iotstamp before declaring that no 118 * I/Os are going on. 119 */ 120 clock_t ufs_iowait; 121 122 /* 123 * the threads that process idle inodes and free (deleted) inodes 124 * have high water marks that are set in ufs_iinit(). 125 * These values can be tuned, but can be no less than the minimum shown below 126 */ 127 int ufs_idle_max; /* # of allowable idle inodes */ 128 ulong_t ufs_inode_max; /* hard limit of allowable idle inodes */ 129 #define UFS_IDLE_MAX (16) /* min # of allowable idle inodes */ 130 131 /* 132 * Tunables for ufs write throttling. 133 * These are validated in ufs_iinit() since improper settings 134 * can lead to filesystem hangs. 
135 */ 136 #define UFS_HW_DEFAULT (16 * 1024 * 1024) 137 #define UFS_LW_DEFAULT (8 * 1024 * 1024) 138 int ufs_HW = UFS_HW_DEFAULT; 139 int ufs_LW = UFS_LW_DEFAULT; 140 141 static void ihinit(void); 142 extern int hash2ints(int, int); 143 144 static int ufs_iget_internal(struct vfs *, ino_t, struct inode **, 145 struct cred *, int); 146 147 /* ARGSUSED */ 148 static int 149 ufs_inode_kstat_update(kstat_t *ksp, int rw) 150 { 151 if (rw == KSTAT_WRITE) 152 return (EACCES); 153 154 ins.in_malloc.value.ul = (ulong_t)kmem_cache_stat(inode_cache, 155 "slab_alloc"); 156 ins.in_mfree.value.ul = (ulong_t)kmem_cache_stat(inode_cache, 157 "slab_free"); 158 ins.in_kcalloc.value.ul = (ulong_t)kmem_cache_stat(inode_cache, 159 "alloc"); 160 ins.in_kcfree.value.ul = (ulong_t)kmem_cache_stat(inode_cache, 161 "free"); 162 ins.in_size.value.ul = (ulong_t)kmem_cache_stat(inode_cache, 163 "buf_inuse"); 164 ins.in_maxreached.value.ul = (ulong_t)kmem_cache_stat(inode_cache, 165 "buf_max"); 166 ins.in_misses.value.ul = ins.in_kcalloc.value.ul; 167 168 return (0); 169 } 170 171 void 172 ufs_iinit(void) 173 { 174 /* 175 * Validate that ufs_HW > ufs_LW. 176 * The default values for these two tunables have been increased. 177 * There is now a range of values for ufs_HW that used to be 178 * legal on previous Solaris versions but no longer is now. 179 * Upgrading a machine which has an /etc/system setting for ufs_HW 180 * from that range can lead to filesystem hangs unless the values 181 * are checked here. 182 */ 183 if (ufs_HW <= ufs_LW) { 184 cmn_err(CE_WARN, 185 "ufs_HW (%d) <= ufs_LW (%d). 
Check /etc/system.", 186 ufs_HW, ufs_LW); 187 ufs_LW = UFS_LW_DEFAULT; 188 ufs_HW = UFS_HW_DEFAULT; 189 cmn_err(CE_CONT, "using defaults, ufs_HW = %d, ufs_LW = %d\n", 190 ufs_HW, ufs_LW); 191 } 192 193 /* 194 * Adjust the tunable `ufs_ninode' to a reasonable value 195 */ 196 if (ufs_ninode <= 0) 197 ufs_ninode = ncsize; 198 if (ufs_inode_max == 0) 199 ufs_inode_max = 200 (ulong_t)((kmem_maxavail() >> 2) / sizeof (struct inode)); 201 if (ufs_ninode > ufs_inode_max || (ufs_ninode == 0 && ncsize == 0)) { 202 cmn_err(CE_NOTE, "setting ufs_ninode to max value of %ld", 203 ufs_inode_max); 204 ufs_ninode = ufs_inode_max; 205 } 206 /* 207 * Wait till third call of ufs_update to declare that no I/Os are 208 * going on. This allows deferred access times to be flushed to disk. 209 */ 210 ufs_iowait = v.v_autoup * hz * 2; 211 212 /* 213 * idle thread runs when 25% of ufs_ninode entries are on the queue 214 */ 215 if (ufs_idle_max == 0) 216 ufs_idle_max = ufs_ninode >> 2; 217 if (ufs_idle_max < UFS_IDLE_MAX) 218 ufs_idle_max = UFS_IDLE_MAX; 219 if (ufs_idle_max > ufs_ninode) 220 ufs_idle_max = ufs_ninode; 221 /* 222 * This is really a misnomer, it is ufs_queue_init 223 */ 224 ufs_thread_init(&ufs_idle_q, ufs_idle_max); 225 ufs_thread_start(&ufs_idle_q, ufs_thread_idle, NULL); 226 227 /* 228 * global hlock thread 229 */ 230 ufs_thread_init(&ufs_hlock, 1); 231 ufs_thread_start(&ufs_hlock, ufs_thread_hlock, NULL); 232 233 ihinit(); 234 qtinit(); 235 ins.in_maxsize.value.ul = ufs_ninode; 236 if ((ufs_inode_kstat = kstat_create("ufs", 0, "inode_cache", "ufs", 237 KSTAT_TYPE_NAMED, sizeof (ins) / sizeof (kstat_named_t), 238 KSTAT_FLAG_VIRTUAL)) != NULL) { 239 ufs_inode_kstat->ks_data = (void *)&ins; 240 ufs_inode_kstat->ks_update = ufs_inode_kstat_update; 241 kstat_install(ufs_inode_kstat); 242 } 243 ufsfx_init(); /* fix-on-panic initialization */ 244 si_cache_init(); 245 ufs_directio_init(); 246 lufs_init(); 247 mutex_init(&ufs_iuniqtime_lock, NULL, MUTEX_DEFAULT, NULL); 248 } 249 
250 /* ARGSUSED */ 251 static int 252 ufs_inode_cache_constructor(void *buf, void *cdrarg, int kmflags) 253 { 254 struct inode *ip = buf; 255 struct vnode *vp; 256 257 vp = ip->i_vnode = vn_alloc(kmflags); 258 if (vp == NULL) { 259 return (-1); 260 } 261 vn_setops(vp, ufs_vnodeops); 262 vp->v_data = ip; 263 264 rw_init(&ip->i_rwlock, NULL, RW_DEFAULT, NULL); 265 rw_init(&ip->i_contents, NULL, RW_DEFAULT, NULL); 266 mutex_init(&ip->i_tlock, NULL, MUTEX_DEFAULT, NULL); 267 dnlc_dir_init(&ip->i_danchor); 268 269 cv_init(&ip->i_wrcv, NULL, CV_DRIVER, NULL); 270 271 return (0); 272 } 273 274 /* ARGSUSED */ 275 static void 276 ufs_inode_cache_destructor(void *buf, void *cdrarg) 277 { 278 struct inode *ip = buf; 279 struct vnode *vp; 280 281 vp = ITOV(ip); 282 283 rw_destroy(&ip->i_rwlock); 284 rw_destroy(&ip->i_contents); 285 mutex_destroy(&ip->i_tlock); 286 if (vp->v_type == VDIR) { 287 dnlc_dir_fini(&ip->i_danchor); 288 } 289 290 cv_destroy(&ip->i_wrcv); 291 292 vn_free(vp); 293 } 294 295 /* 296 * Initialize hash links for inodes 297 * and build inode free list. 
298 */ 299 void 300 ihinit(void) 301 { 302 int i; 303 union ihead *ih = ihead; 304 305 mutex_init(&ufs_scan_lock, NULL, MUTEX_DEFAULT, NULL); 306 307 inohsz = 1 << highbit(ufs_ninode / ino_hashlen); 308 ihead = kmem_zalloc(inohsz * sizeof (union ihead), KM_SLEEP); 309 ih_lock = kmem_zalloc(inohsz * sizeof (kmutex_t), KM_SLEEP); 310 311 for (i = 0, ih = ihead; i < inohsz; i++, ih++) { 312 ih->ih_head[0] = ih; 313 ih->ih_head[1] = ih; 314 mutex_init(&ih_lock[i], NULL, MUTEX_DEFAULT, NULL); 315 } 316 inode_cache = kmem_cache_create("ufs_inode_cache", 317 sizeof (struct inode), 0, ufs_inode_cache_constructor, 318 ufs_inode_cache_destructor, ufs_inode_cache_reclaim, 319 NULL, NULL, 0); 320 } 321 322 /* 323 * Free an inode structure 324 */ 325 void 326 ufs_free_inode(struct inode *ip) 327 { 328 vn_invalid(ITOV(ip)); 329 kmem_cache_free(inode_cache, ip); 330 } 331 332 /* 333 * Allocate an inode structure 334 */ 335 struct inode * 336 ufs_alloc_inode(ufsvfs_t *ufsvfsp, ino_t ino) 337 { 338 struct inode *ip; 339 vnode_t *vp; 340 341 ip = kmem_cache_alloc(inode_cache, KM_SLEEP); 342 /* 343 * at this point we have a newly allocated inode 344 */ 345 ip->i_freef = ip; 346 ip->i_freeb = ip; 347 ip->i_flag = IREF; 348 ip->i_seq = 0xFF; /* Unique initial value */ 349 ip->i_dev = ufsvfsp->vfs_dev; 350 ip->i_ufsvfs = ufsvfsp; 351 ip->i_devvp = ufsvfsp->vfs_devvp; 352 ip->i_number = ino; 353 ip->i_diroff = 0; 354 ip->i_nextr = 0; 355 ip->i_map = NULL; 356 ip->i_rdev = 0; 357 ip->i_writes = 0; 358 ip->i_mode = 0; 359 ip->i_delaylen = 0; 360 ip->i_delayoff = 0; 361 ip->i_nextrio = 0; 362 ip->i_ufs_acl = NULL; 363 ip->i_cflags = 0; 364 ip->i_mapcnt = 0; 365 ip->i_dquot = NULL; 366 ip->i_cachedir = CD_ENABLED; 367 ip->i_writer = NULL; 368 369 /* 370 * the vnode for this inode was allocated by the constructor 371 */ 372 vp = ITOV(ip); 373 vn_reinit(vp); 374 if (ino == (ino_t)UFSROOTINO) 375 vp->v_flag = VROOT; 376 vp->v_vfsp = ufsvfsp->vfs_vfs; 377 vn_exists(vp); 378 return (ip); 379 } 
380 381 /* 382 * Look up an inode by device, inumber. If it is in core (in the 383 * inode structure), honor the locking protocol. If it is not in 384 * core, read it in from the specified device after freeing any pages. 385 * In all cases, a pointer to a VN_HELD inode structure is returned. 386 */ 387 int 388 ufs_iget(struct vfs *vfsp, ino_t ino, struct inode **ipp, struct cred *cr) 389 { 390 return (ufs_iget_internal(vfsp, ino, ipp, cr, 0)); 391 } 392 393 /* 394 * A version of ufs_iget which returns only allocated, linked inodes. 395 * This is appropriate for any callers who do not expect a free inode. 396 */ 397 int 398 ufs_iget_alloced(struct vfs *vfsp, ino_t ino, struct inode **ipp, 399 struct cred *cr) 400 { 401 return (ufs_iget_internal(vfsp, ino, ipp, cr, 1)); 402 } 403 404 /* 405 * Set vnode attributes based on v_type, this should be called whenever 406 * an inode's i_mode is changed. 407 */ 408 void 409 ufs_reset_vnode(vnode_t *vp) 410 { 411 /* 412 * an old DBE hack 413 */ 414 if ((VTOI(vp)->i_mode & (ISVTX | IEXEC | IFDIR)) == ISVTX) 415 vp->v_flag |= VSWAPLIKE; 416 else 417 vp->v_flag &= ~VSWAPLIKE; 418 419 /* 420 * if not swap like and it's just a regular file, we want 421 * to maintain the vnode's pages sorted by clean/modified 422 * for faster sync'ing to disk 423 */ 424 if (vp->v_type == VREG) 425 vp->v_flag |= VMODSORT; 426 else 427 vp->v_flag &= ~VMODSORT; 428 429 /* 430 * Is this an attribute hidden dir? 431 */ 432 if ((VTOI(vp)->i_mode & IFMT) == IFATTRDIR) 433 vp->v_flag |= V_XATTRDIR; 434 else 435 vp->v_flag &= ~V_XATTRDIR; 436 } 437 438 /* 439 * Shared implementation of ufs_iget and ufs_iget_alloced. The 'validate' 440 * flag is used to distinguish the two; when true, we validate that the inode 441 * being retrieved looks like a linked and allocated inode. 
442 */ 443 /* ARGSUSED */ 444 static int 445 ufs_iget_internal(struct vfs *vfsp, ino_t ino, struct inode **ipp, 446 struct cred *cr, int validate) 447 { 448 struct inode *ip, *sp; 449 union ihead *ih; 450 kmutex_t *ihm; 451 struct buf *bp; 452 struct dinode *dp; 453 struct vnode *vp; 454 extern vfs_t EIO_vfs; 455 int error; 456 int ftype; /* XXX - Remove later on */ 457 dev_t vfs_dev; 458 struct ufsvfs *ufsvfsp; 459 struct fs *fs; 460 int hno; 461 daddr_t bno; 462 ulong_t ioff; 463 464 CPU_STATS_ADD_K(sys, ufsiget, 1); 465 466 /* 467 * Lookup inode in cache. 468 */ 469 vfs_dev = vfsp->vfs_dev; 470 hno = INOHASH(ino); 471 ih = &ihead[hno]; 472 ihm = &ih_lock[hno]; 473 474 again: 475 mutex_enter(ihm); 476 for (ip = ih->ih_chain[0]; ip != (struct inode *)ih; ip = ip->i_forw) { 477 if (ino != ip->i_number || vfs_dev != ip->i_dev || 478 (ip->i_flag & ISTALE)) 479 continue; 480 481 /* 482 * Found the interesting inode; hold it and drop the cache lock 483 */ 484 vp = ITOV(ip); /* for locknest */ 485 VN_HOLD(vp); 486 mutex_exit(ihm); 487 rw_enter(&ip->i_contents, RW_READER); 488 489 /* 490 * if necessary, remove from idle list 491 */ 492 if ((ip->i_flag & IREF) == 0) { 493 if (ufs_rmidle(ip)) 494 VN_RELE(vp); 495 } 496 497 /* 498 * Could the inode be read from disk? 499 */ 500 if (ip->i_flag & ISTALE) { 501 rw_exit(&ip->i_contents); 502 VN_RELE(vp); 503 goto again; 504 } 505 506 ins.in_hits.value.ul++; 507 *ipp = ip; 508 509 /* 510 * Reset the vnode's attribute flags 511 */ 512 mutex_enter(&vp->v_lock); 513 ufs_reset_vnode(vp); 514 mutex_exit(&vp->v_lock); 515 516 rw_exit(&ip->i_contents); 517 518 return (0); 519 } 520 mutex_exit(ihm); 521 522 /* 523 * Inode was not in cache. 
524 * 525 * Allocate a new entry 526 */ 527 ufsvfsp = (struct ufsvfs *)vfsp->vfs_data; 528 fs = ufsvfsp->vfs_fs; 529 530 ip = ufs_alloc_inode(ufsvfsp, ino); 531 vp = ITOV(ip); 532 533 bno = fsbtodb(fs, itod(fs, ino)); 534 ioff = (sizeof (struct dinode)) * (itoo(fs, ino)); 535 ip->i_doff = (offset_t)ioff + ldbtob(bno); 536 537 /* 538 * put a place holder in the cache (if not already there) 539 */ 540 mutex_enter(ihm); 541 for (sp = ih->ih_chain[0]; sp != (struct inode *)ih; sp = sp->i_forw) 542 if (ino == sp->i_number && vfs_dev == sp->i_dev && 543 ((sp->i_flag & ISTALE) == 0)) { 544 mutex_exit(ihm); 545 ufs_free_inode(ip); 546 goto again; 547 } 548 /* 549 * It would be nice to ASSERT(RW_READ_HELD(&ufsvfsp->vfs_dqrwlock)) 550 * here, but if we do, then shadow inode allocations panic the 551 * system. We don't have to hold vfs_dqrwlock for shadow inodes 552 * and the ufs_iget() parameters don't tell us what we are getting 553 * so we have no way of knowing this is a ufs_iget() call from 554 * a ufs_ialloc() call for a shadow inode. 555 */ 556 rw_enter(&ip->i_contents, RW_WRITER); 557 insque(ip, ih); 558 mutex_exit(ihm); 559 /* 560 * read the dinode 561 */ 562 bp = UFS_BREAD(ufsvfsp, ip->i_dev, bno, (int)fs->fs_bsize); 563 564 /* 565 * Check I/O errors 566 */ 567 error = ((bp->b_flags & B_ERROR) ? 
geterror(bp) : 0); 568 if (error) { 569 brelse(bp); 570 ip->i_flag |= ISTALE; /* in case someone is looking it up */ 571 rw_exit(&ip->i_contents); 572 vp->v_vfsp = &EIO_vfs; 573 VN_RELE(vp); 574 return (error); 575 } 576 /* 577 * initialize the inode's dinode 578 */ 579 dp = (struct dinode *)(ioff + bp->b_un.b_addr); 580 ip->i_ic = dp->di_ic; /* structure assignment */ 581 brelse(bp); 582 583 /* 584 * Maintain compatibility with Solaris 1.x UFS 585 */ 586 if (ip->i_suid != UID_LONG) 587 ip->i_uid = ip->i_suid; 588 if (ip->i_sgid != GID_LONG) 589 ip->i_gid = ip->i_sgid; 590 591 ftype = ip->i_mode & IFMT; 592 if (ftype == IFBLK || ftype == IFCHR) { 593 dev_t dv; 594 uint_t top16 = ip->i_ordev & 0xffff0000u; 595 596 if (top16 == 0 || top16 == 0xffff0000u) 597 dv = expdev(ip->i_ordev); 598 else 599 dv = expldev(ip->i_ordev); 600 vp->v_rdev = ip->i_rdev = dv; 601 } 602 603 /* 604 * if our caller only expects allocated inodes, verify that 605 * this inode looks good; throw it out if it's bad. 606 */ 607 if (validate) { 608 if ((ftype == 0) || (ip->i_nlink <= 0)) { 609 ip->i_flag |= ISTALE; 610 rw_exit(&ip->i_contents); 611 vp->v_vfsp = &EIO_vfs; 612 VN_RELE(vp); 613 cmn_err(CE_NOTE, 614 "%s: unexpected free inode %d, run fsck(1M)%s", 615 fs->fs_fsmnt, (int)ino, 616 (TRANS_ISTRANS(ufsvfsp) ? " -o f" : "")); 617 return (EIO); 618 } 619 } 620 621 /* 622 * Finish initializing the vnode, special handling for shadow inodes 623 * because IFTOVT() will produce a v_type of VNON which is not what we 624 * want, set v_type to VREG explicitly in that case. 
625 */ 626 if (ftype == IFSHAD) { 627 vp->v_type = VREG; 628 } else { 629 vp->v_type = IFTOVT((mode_t)ip->i_mode); 630 } 631 632 ufs_reset_vnode(vp); 633 634 /* 635 * read the shadow 636 */ 637 if (ftype != 0 && ip->i_shadow != 0) { 638 if ((error = ufs_si_load(ip, cr)) != 0) { 639 ip->i_flag |= ISTALE; 640 ip->i_ufs_acl = NULL; 641 rw_exit(&ip->i_contents); 642 vp->v_vfsp = &EIO_vfs; 643 VN_RELE(vp); 644 return (error); 645 } 646 } 647 648 /* 649 * Only attach quota information if the inode has a type and if 650 * that type is not a shadow inode. 651 */ 652 if (ip->i_mode && ((ip->i_mode & IFMT) != IFSHAD) && 653 ((ip->i_mode & IFMT) != IFATTRDIR)) { 654 ip->i_dquot = getinoquota(ip); 655 } 656 TRANS_MATA_IGET(ufsvfsp, ip); 657 *ipp = ip; 658 rw_exit(&ip->i_contents); 659 660 return (0); 661 } 662 663 /* 664 * Vnode is no longer referenced, write the inode out 665 * and if necessary, truncate and deallocate the file. 666 */ 667 void 668 ufs_iinactive(struct inode *ip) 669 { 670 int front; 671 struct inode *iq; 672 struct inode *hip; 673 struct ufs_q *uq; 674 struct vnode *vp = ITOV(ip); 675 struct ufsvfs *ufsvfsp = ip->i_ufsvfs; 676 struct ufs_delq_info *delq_info = &ufsvfsp->vfs_delete_info; 677 678 /* 679 * Because the vnode type might have been changed, 680 * the dnlc_dir_purge must be called unconditionally. 681 */ 682 dnlc_dir_purge(&ip->i_danchor); 683 684 /* 685 * Get exclusive access to inode data. 686 */ 687 rw_enter(&ip->i_contents, RW_WRITER); 688 ASSERT(ip->i_flag & IREF); 689 690 /* 691 * Make sure no one reclaimed the inode before we put it on 692 * the freelist or destroy it. We keep our 'hold' on the vnode 693 * from vn_rele until we are ready to do something with the inode. 694 * 695 * Pageout may put a VN_HOLD/VN_RELE at anytime during this 696 * operation via an async putpage, so we must make sure 697 * we don't free/destroy the inode more than once. ufs_iget 698 * may also put a VN_HOLD on the inode before it grabs 699 * the i_contents lock. 
This is done so we don't free 700 * an inode that a thread is waiting on. 701 */ 702 mutex_enter(&vp->v_lock); 703 704 if (vp->v_count > 1) { 705 VN_RELE_LOCKED(vp); 706 mutex_exit(&vp->v_lock); 707 rw_exit(&ip->i_contents); 708 return; 709 } 710 mutex_exit(&vp->v_lock); 711 712 /* 713 * For umount case: if ufsvfs ptr is NULL, the inode is unhashed 714 * and clean. It can be safely destroyed (cyf). 715 */ 716 if (ip->i_ufsvfs == NULL) { 717 rw_exit(&ip->i_contents); 718 ufs_si_del(ip); 719 ASSERT((vp->v_type == VCHR) || !vn_has_cached_data(vp)); 720 ufs_free_inode(ip); 721 return; 722 } 723 724 /* 725 * queue idle inode to appropriate thread. Will check v_count == 1 726 * prior to putting this on the appropriate queue. 727 * Stale inodes will be unhashed and freed by the ufs idle thread 728 * in ufs_idle_free() 729 */ 730 front = 1; 731 if ((ip->i_flag & ISTALE) == 0 && ip->i_fs->fs_ronly == 0 && 732 ip->i_mode && ip->i_nlink <= 0) { 733 /* 734 * Mark the i_flag to indicate that inode is being deleted. 735 * This flag will be cleared when the deletion is complete. 736 * This prevents nfs from sneaking in via ufs_vget() while 737 * the delete is in progress (bugid 1242481). 738 */ 739 ip->i_flag |= IDEL; 740 741 /* 742 * NOIDEL means that deletes are not allowed at this time; 743 * whoever resets NOIDEL will also send this inode back 744 * through ufs_iinactive. IREF remains set. 
745 */ 746 if (ULOCKFS_IS_NOIDEL(ITOUL(ip))) { 747 mutex_enter(&vp->v_lock); 748 VN_RELE_LOCKED(vp); 749 mutex_exit(&vp->v_lock); 750 rw_exit(&ip->i_contents); 751 return; 752 } 753 if (!TRANS_ISTRANS(ip->i_ufsvfs)) { 754 rw_exit(&ip->i_contents); 755 ufs_delete(ip->i_ufsvfs, ip, 0); 756 return; 757 } 758 759 /* queue to delete thread; IREF remains set */ 760 ins.in_qfree.value.ul++; 761 uq = &ip->i_ufsvfs->vfs_delete; 762 763 mutex_enter(&uq->uq_mutex); 764 765 /* add to q */ 766 if ((iq = uq->uq_ihead) != 0) { 767 ip->i_freef = iq; 768 ip->i_freeb = iq->i_freeb; 769 iq->i_freeb->i_freef = ip; 770 iq->i_freeb = ip; 771 if (front) 772 uq->uq_ihead = ip; 773 } else { 774 uq->uq_ihead = ip; 775 ip->i_freef = ip; 776 ip->i_freeb = ip; 777 } 778 779 delq_info->delq_unreclaimed_files += 1; 780 delq_info->delq_unreclaimed_blocks += ip->i_blocks; 781 } else { 782 /* 783 * queue to idle thread 784 * Check the v_count == 1 again. 785 * 786 */ 787 mutex_enter(&vp->v_lock); 788 if (vp->v_count > 1) { 789 VN_RELE_LOCKED(vp); 790 mutex_exit(&vp->v_lock); 791 rw_exit(&ip->i_contents); 792 return; 793 } 794 mutex_exit(&vp->v_lock); 795 uq = &ufs_idle_q; 796 797 /* 798 * useful iff it has pages or is a fastsymlink; otherwise junk 799 */ 800 mutex_enter(&uq->uq_mutex); 801 802 /* clear IREF means `on idle list' */ 803 ip->i_flag &= ~(IREF | IDIRECTIO); 804 805 if (vn_has_cached_data(vp) || ip->i_flag & IFASTSYMLNK) { 806 ins.in_frback.value.ul++; 807 hip = (inode_t *)&ufs_useful_iq[IQHASH(ip)]; 808 ufs_nuseful_iq++; 809 } else { 810 ins.in_frfront.value.ul++; 811 hip = (inode_t *)&ufs_junk_iq[IQHASH(ip)]; 812 ip->i_flag |= IJUNKIQ; 813 ufs_njunk_iq++; 814 } 815 ip->i_freef = hip; 816 ip->i_freeb = hip->i_freeb; 817 hip->i_freeb->i_freef = ip; 818 hip->i_freeb = ip; 819 } 820 821 /* wakeup thread(s) if q is overfull */ 822 if (++uq->uq_ne == uq->uq_lowat) 823 cv_broadcast(&uq->uq_cv); 824 825 /* all done, release the q and inode */ 826 mutex_exit(&uq->uq_mutex); 827 
rw_exit(&ip->i_contents); 828 } 829 830 /* 831 * Check accessed and update flags on an inode structure. 832 * If any are on, update the inode with the (unique) current time. 833 * If waitfor is given, insure I/O order so wait for write to complete. 834 */ 835 void 836 ufs_iupdat(struct inode *ip, int waitfor) 837 { 838 struct buf *bp; 839 struct fs *fp; 840 struct dinode *dp; 841 struct ufsvfs *ufsvfsp = ip->i_ufsvfs; 842 int i; 843 int do_trans_times; 844 ushort_t flag; 845 o_uid_t suid; 846 o_gid_t sgid; 847 848 /* 849 * This function is now safe to be called with either the reader 850 * or writer i_contents lock. 851 */ 852 ASSERT(RW_LOCK_HELD(&ip->i_contents)); 853 854 /* 855 * Return if file system has been forcibly umounted. 856 */ 857 if (ufsvfsp == NULL) 858 return; 859 860 flag = ip->i_flag; /* Atomic read */ 861 /* 862 * We better not update the disk inode from a stale inode. 863 */ 864 if (flag & ISTALE) 865 return; 866 867 fp = ip->i_fs; 868 869 if ((flag & (IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG)) != 0) { 870 if (fp->fs_ronly) { 871 mutex_enter(&ip->i_tlock); 872 ip->i_flag &= ~(IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG); 873 mutex_exit(&ip->i_tlock); 874 return; 875 } 876 /* 877 * fs is active while metadata is being written 878 */ 879 mutex_enter(&ufsvfsp->vfs_lock); 880 ufs_notclean(ufsvfsp); 881 /* 882 * get the dinode 883 */ 884 bp = UFS_BREAD(ufsvfsp, ip->i_dev, 885 (daddr_t)fsbtodb(fp, itod(fp, ip->i_number)), 886 (int)fp->fs_bsize); 887 if (bp->b_flags & B_ERROR) { 888 mutex_enter(&ip->i_tlock); 889 ip->i_flag &= 890 ~(IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG); 891 mutex_exit(&ip->i_tlock); 892 brelse(bp); 893 return; 894 } 895 /* 896 * munge inode fields 897 */ 898 mutex_enter(&ip->i_tlock); 899 ITIMES_NOLOCK(ip); 900 do_trans_times = ((ip->i_flag & (IMOD|IMODACC)) == IMODACC); 901 ip->i_flag &= ~(IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG); 902 mutex_exit(&ip->i_tlock); 903 904 /* 905 * For reads and concurrent re-writes, no deltas were 906 * entered for the 
access time changes - do it now. 907 */ 908 if (do_trans_times) { 909 TRANS_INODE_TIMES(ufsvfsp, ip); 910 } 911 912 /* 913 * For SunOS 5.0->5.4, these lines below read: 914 * 915 * suid = (ip->i_uid > MAXUID) ? UID_LONG : ip->i_uid; 916 * sgid = (ip->i_gid > MAXUID) ? GID_LONG : ip->i_gid; 917 * 918 * where MAXUID was set to 60002. This was incorrect - 919 * the uids should have been constrained to what fitted into 920 * a 16-bit word. 921 * 922 * This means that files from 4.x filesystems that have an 923 * i_suid field larger than 60002 will have that field 924 * changed to 65535. 925 * 926 * Security note: 4.x UFS could never create a i_suid of 927 * UID_LONG since that would've corresponded to -1. 928 */ 929 suid = (ulong_t)ip->i_uid > (ulong_t)USHRT_MAX ? 930 UID_LONG : ip->i_uid; 931 sgid = (ulong_t)ip->i_gid > (ulong_t)USHRT_MAX ? 932 GID_LONG : ip->i_gid; 933 934 if ((ip->i_suid != suid) || (ip->i_sgid != sgid)) { 935 ip->i_suid = suid; 936 ip->i_sgid = sgid; 937 TRANS_INODE(ufsvfsp, ip); 938 } 939 940 if ((ip->i_mode & IFMT) == IFBLK || 941 (ip->i_mode & IFMT) == IFCHR) { 942 dev_t d = ip->i_rdev; 943 dev32_t dev32; 944 945 /* 946 * load first direct block only if special device 947 */ 948 if (!cmpldev(&dev32, d)) { 949 /* 950 * We panic here because there's "no way" 951 * we should have been able to create a large 952 * inode with a large dev_t. Earlier layers 953 * should've caught this. 954 */ 955 panic("ip %p: i_rdev too big", (void *)ip); 956 } 957 958 if (dev32 & ~((O_MAXMAJ << L_BITSMINOR32) | O_MAXMIN)) { 959 ip->i_ordev = dev32; /* can't use old fmt. 
*/ 960 } else { 961 ip->i_ordev = cmpdev(d); 962 } 963 } 964 965 /* 966 * copy inode to dinode (zero fastsymlnk in dinode) 967 */ 968 dp = (struct dinode *)bp->b_un.b_addr + itoo(fp, ip->i_number); 969 dp->di_ic = ip->i_ic; /* structure assignment */ 970 if (flag & IFASTSYMLNK) { 971 for (i = 1; i < NDADDR; i++) 972 dp->di_db[i] = 0; 973 for (i = 0; i < NIADDR; i++) 974 dp->di_ib[i] = 0; 975 } 976 if (TRANS_ISTRANS(ufsvfsp)) { 977 /* 978 * Pass only a sector size buffer containing 979 * the inode, otherwise when the buffer is copied 980 * into a cached roll buffer then too much memory 981 * gets consumed if 8KB inode buffers are passed. 982 */ 983 TRANS_LOG(ufsvfsp, (caddr_t)dp, ip->i_doff, 984 sizeof (struct dinode), 985 (caddr_t)P2ALIGN((uintptr_t)dp, DEV_BSIZE), 986 DEV_BSIZE); 987 988 brelse(bp); 989 } else if (waitfor && (ip->i_ufsvfs->vfs_dio == 0)) { 990 UFS_BRWRITE(ufsvfsp, bp); 991 992 /* 993 * Synchronous write has guaranteed that inode 994 * has been written on disk so clear the flag 995 */ 996 mutex_enter(&ip->i_tlock); 997 ip->i_flag &= ~IBDWRITE; 998 mutex_exit(&ip->i_tlock); 999 } else { 1000 bdrwrite(bp); 1001 1002 /* 1003 * This write hasn't guaranteed that inode has been 1004 * written on the disk. 1005 * Since, all updat flags on inode are cleared, we must 1006 * remember the condition in case inode is to be updated 1007 * synchronously later (e.g.- fsync()/fdatasync()) 1008 * and inode has not been modified yet. 1009 */ 1010 mutex_enter(&ip->i_tlock); 1011 ip->i_flag |= IBDWRITE; 1012 mutex_exit(&ip->i_tlock); 1013 } 1014 } else { 1015 /* 1016 * In case previous inode update was done asynchronously 1017 * (IBDWRITE) and this inode update request wants guaranteed 1018 * (synchronous) disk update, flush the inode. 
1019 */ 1020 if (waitfor && (flag & IBDWRITE)) { 1021 blkflush(ip->i_dev, 1022 (daddr_t)fsbtodb(fp, itod(fp, ip->i_number))); 1023 mutex_enter(&ip->i_tlock); 1024 ip->i_flag &= ~IBDWRITE; 1025 mutex_exit(&ip->i_tlock); 1026 } 1027 } 1028 } 1029 1030 #define SINGLE 0 /* index of single indirect block */ 1031 #define DOUBLE 1 /* index of double indirect block */ 1032 #define TRIPLE 2 /* index of triple indirect block */ 1033 1034 /* 1035 * Release blocks associated with the inode ip and 1036 * stored in the indirect block bn. Blocks are free'd 1037 * in LIFO order up to (but not including) lastbn. If 1038 * level is greater than SINGLE, the block is an indirect 1039 * block and recursive calls to indirtrunc must be used to 1040 * cleanse other indirect blocks. 1041 * 1042 * N.B.: triple indirect blocks are untested. 1043 */ 1044 static long 1045 indirtrunc(struct inode *ip, daddr_t bn, daddr_t lastbn, int level, int flags) 1046 { 1047 int i; 1048 struct buf *bp, *copy; 1049 daddr32_t *bap; 1050 struct ufsvfs *ufsvfsp = ip->i_ufsvfs; 1051 struct fs *fs = ufsvfsp->vfs_fs; 1052 daddr_t nb, last; 1053 long factor; 1054 int blocksreleased = 0, nblocks; 1055 1056 ASSERT(RW_WRITE_HELD(&ip->i_contents)); 1057 /* 1058 * Calculate index in current block of last 1059 * block to be kept. -1 indicates the entire 1060 * block so we need not calculate the index. 1061 */ 1062 factor = 1; 1063 for (i = SINGLE; i < level; i++) 1064 factor *= NINDIR(fs); 1065 last = lastbn; 1066 if (lastbn > 0) 1067 last /= factor; 1068 nblocks = btodb(fs->fs_bsize); 1069 /* 1070 * Get buffer of block pointers, zero those 1071 * entries corresponding to blocks to be free'd, 1072 * and update on disk copy first. 1073 * *Unless* the root pointer has been synchronously 1074 * written to disk. If nothing points to this 1075 * indirect block then don't bother zero'ing and 1076 * writing it. 
1077 */ 1078 bp = UFS_BREAD(ufsvfsp, 1079 ip->i_dev, (daddr_t)fsbtodb(fs, bn), (int)fs->fs_bsize); 1080 if (bp->b_flags & B_ERROR) { 1081 brelse(bp); 1082 return (0); 1083 } 1084 bap = bp->b_un.b_daddr; 1085 if ((flags & I_CHEAP) == 0) { 1086 uint_t zb; 1087 1088 zb = (uint_t)((NINDIR(fs) - (last + 1)) * sizeof (daddr32_t)); 1089 1090 if (zb) { 1091 /* 1092 * push any data into the log before we zero it 1093 */ 1094 if (bp->b_flags & B_DELWRI) 1095 TRANS_LOG(ufsvfsp, (caddr_t)bap, 1096 ldbtob(bp->b_blkno), bp->b_bcount, 1097 bp->b_un.b_addr, bp->b_bcount); 1098 copy = ngeteblk(fs->fs_bsize); 1099 bcopy((caddr_t)bap, (caddr_t)copy->b_un.b_daddr, 1100 (uint_t)fs->fs_bsize); 1101 bzero((caddr_t)&bap[last + 1], zb); 1102 1103 TRANS_BUF(ufsvfsp, 1104 (caddr_t)&bap[last + 1] - (caddr_t)bap, 1105 zb, bp, DT_ABZERO); 1106 1107 UFS_BRWRITE(ufsvfsp, bp); 1108 bp = copy, bap = bp->b_un.b_daddr; 1109 } 1110 } else { 1111 /* make sure write retries are also cleared */ 1112 bp->b_flags &= ~(B_DELWRI | B_RETRYWRI); 1113 bp->b_flags |= B_STALE | B_AGE; 1114 } 1115 1116 /* 1117 * Recursively free totally unused blocks. 1118 */ 1119 flags |= I_CHEAP; 1120 for (i = NINDIR(fs) - 1; i > last; i--) { 1121 nb = bap[i]; 1122 if (nb == 0) 1123 continue; 1124 if (level > SINGLE) { 1125 blocksreleased += 1126 indirtrunc(ip, nb, (daddr_t)-1, level - 1, flags); 1127 free(ip, nb, (off_t)fs->fs_bsize, flags | I_IBLK); 1128 } else 1129 free(ip, nb, (off_t)fs->fs_bsize, flags); 1130 blocksreleased += nblocks; 1131 } 1132 flags &= ~I_CHEAP; 1133 1134 /* 1135 * Recursively free last partial block. 1136 */ 1137 if (level > SINGLE && lastbn >= 0) { 1138 last = lastbn % factor; 1139 nb = bap[i]; 1140 if (nb != 0) 1141 blocksreleased += 1142 indirtrunc(ip, nb, last, level - 1, flags); 1143 } 1144 brelse(bp); 1145 return (blocksreleased); 1146 } 1147 1148 /* 1149 * Truncate the inode ip to at most length size. 1150 * Free affected disk blocks -- the blocks of the 1151 * file are removed in reverse order. 
1152 * 1153 * N.B.: triple indirect blocks are untested. 1154 */ 1155 static int i_genrand = 1234; 1156 int 1157 ufs_itrunc(struct inode *oip, u_offset_t length, int flags, cred_t *cr) 1158 { 1159 struct fs *fs = oip->i_fs; 1160 struct ufsvfs *ufsvfsp = oip->i_ufsvfs; 1161 struct inode *ip; 1162 daddr_t lastblock; 1163 off_t bsize; 1164 int boff; 1165 daddr_t bn, lastiblock[NIADDR]; 1166 int level; 1167 long nblocks, blocksreleased = 0; 1168 int i; 1169 ushort_t mode; 1170 struct inode tip; 1171 int err; 1172 u_offset_t maxoffset = (ufsvfsp->vfs_lfflags & UFS_LARGEFILES) ? 1173 (UFS_MAXOFFSET_T) : (MAXOFF32_T); 1174 1175 /* 1176 * Shadow inodes do not need to hold the vfs_dqrwlock lock. Most 1177 * other uses need the reader lock. opendq() holds the writer lock. 1178 */ 1179 ASSERT((oip->i_mode & IFMT) == IFSHAD || 1180 RW_LOCK_HELD(&ufsvfsp->vfs_dqrwlock)); 1181 ASSERT(RW_WRITE_HELD(&oip->i_contents)); 1182 /* 1183 * We only allow truncation of regular files and directories 1184 * to arbitrary lengths here. In addition, we allow symbolic 1185 * links to be truncated only to zero length. Other inode 1186 * types cannot have their length set here. Disk blocks are 1187 * being dealt with - especially device inodes where 1188 * ip->i_ordev is actually being stored in ip->i_db[0]! 
1189 */ 1190 TRANS_INODE(ufsvfsp, oip); 1191 mode = oip->i_mode & IFMT; 1192 if (flags & I_FREE) { 1193 i_genrand *= 16843009; /* turns into shift and adds */ 1194 i_genrand++; 1195 oip->i_gen += ((i_genrand + ddi_get_lbolt()) & 0xffff) + 1; 1196 oip->i_flag |= ICHG |IUPD; 1197 oip->i_seq++; 1198 if (length == oip->i_size) 1199 return (0); 1200 flags |= I_CHEAP; 1201 } 1202 if (mode == IFIFO) 1203 return (0); 1204 if (mode != IFREG && mode != IFDIR && mode != IFATTRDIR && 1205 !(mode == IFLNK && length == (offset_t)0) && mode != IFSHAD) 1206 return (EINVAL); 1207 if (length > maxoffset) 1208 return (EFBIG); 1209 if ((mode == IFDIR) || (mode == IFATTRDIR)) 1210 flags |= I_DIR; 1211 if (mode == IFSHAD) 1212 flags |= I_SHAD; 1213 if (oip == ufsvfsp->vfs_qinod) 1214 flags |= I_QUOTA; 1215 if (length == oip->i_size) { 1216 /* update ctime and mtime to please POSIX tests */ 1217 oip->i_flag |= ICHG |IUPD; 1218 oip->i_seq++; 1219 if (length == 0) { 1220 /* nothing to cache so clear the flag */ 1221 oip->i_flag &= ~IFASTSYMLNK; 1222 } 1223 return (0); 1224 } 1225 /* wipe out fast symlink till next access */ 1226 if (oip->i_flag & IFASTSYMLNK) { 1227 int j; 1228 1229 ASSERT(ITOV(oip)->v_type == VLNK); 1230 1231 oip->i_flag &= ~IFASTSYMLNK; 1232 1233 for (j = 1; j < NDADDR; j++) 1234 oip->i_db[j] = 0; 1235 for (j = 0; j < NIADDR; j++) 1236 oip->i_ib[j] = 0; 1237 } 1238 1239 boff = (int)blkoff(fs, length); 1240 1241 if (length > oip->i_size) { 1242 /* 1243 * Trunc up case. BMAPALLOC will insure that the right blocks 1244 * are allocated. This includes extending the old frag to a 1245 * full block (if needed) in addition to doing any work 1246 * needed for allocating the last block. 
1247 */ 1248 if (boff == 0) 1249 err = BMAPALLOC(oip, length - 1, (int)fs->fs_bsize, cr); 1250 else 1251 err = BMAPALLOC(oip, length - 1, boff, cr); 1252 1253 if (err == 0) { 1254 /* 1255 * Save old size and set inode's size now 1256 * so that we don't cause too much of the 1257 * file to be zero'd and pushed. 1258 */ 1259 u_offset_t osize = oip->i_size; 1260 oip->i_size = length; 1261 /* 1262 * Make sure we zero out the remaining bytes of 1263 * the page in case a mmap scribbled on it. We 1264 * can't prevent a mmap from writing beyond EOF 1265 * on the last page of a file. 1266 * 1267 */ 1268 if ((boff = (int)blkoff(fs, osize)) != 0) { 1269 bsize = (int)lblkno(fs, osize - 1) >= NDADDR ? 1270 fs->fs_bsize : fragroundup(fs, boff); 1271 pvn_vpzero(ITOV(oip), osize, 1272 (size_t)(bsize - boff)); 1273 } 1274 oip->i_flag |= ICHG|IATTCHG; 1275 oip->i_seq++; 1276 ITIMES_NOLOCK(oip); 1277 /* 1278 * MAXOFF32_T is old 2GB size limit. If 1279 * this operation caused a large file to be 1280 * created, turn on the superblock flag 1281 * and update the superblock, if the flag 1282 * is not already on. 1283 */ 1284 if ((length > (u_offset_t)MAXOFF32_T) && 1285 !(fs->fs_flags & FSLARGEFILES)) { 1286 ASSERT(ufsvfsp->vfs_lfflags & UFS_LARGEFILES); 1287 mutex_enter(&ufsvfsp->vfs_lock); 1288 fs->fs_flags |= FSLARGEFILES; 1289 ufs_sbwrite(ufsvfsp); 1290 mutex_exit(&ufsvfsp->vfs_lock); 1291 } 1292 } 1293 1294 return (err); 1295 } 1296 1297 /* 1298 * Update the pages of the file. If the file is not being 1299 * truncated to a block boundary, the contents of the 1300 * pages following the end of the file must be zero'ed 1301 * in case it ever become accessible again because 1302 * of subsequent file growth. 1303 */ 1304 if (boff == 0) { 1305 (void) pvn_vplist_dirty(ITOV(oip), length, ufs_putapage, 1306 B_INVAL | B_TRUNC, CRED()); 1307 } else { 1308 /* 1309 * Make sure that the last block is properly allocated. 
1310 * We only really have to do this if the last block is 1311 * actually allocated since ufs_bmap will now handle the case 1312 * of an fragment which has no block allocated. Just to 1313 * be sure, we do it now independent of current allocation. 1314 */ 1315 err = BMAPALLOC(oip, length - 1, boff, cr); 1316 if (err) 1317 return (err); 1318 1319 /* 1320 * BMAPALLOC will call bmap_write which defers i_seq 1321 * processing. If the timestamps were changed, update 1322 * i_seq before rdip drops i_contents or syncs the inode. 1323 */ 1324 if (oip->i_flag & (ICHG|IUPD)) 1325 oip->i_seq++; 1326 1327 /* 1328 * BugId 4069932 1329 * Make sure that the relevant partial page appears in 1330 * the v_pages list, so that pvn_vpzero() will do its 1331 * job. Since doing this correctly requires everything 1332 * in rdip() except for the uiomove(), it's easier and 1333 * safer to do the uiomove() rather than duplicate the 1334 * rest of rdip() here. 1335 * 1336 * To get here, we know that length indicates a byte 1337 * that is not the first byte of a block. (length - 1) 1338 * is the last actual byte known to exist. Deduction 1339 * shows it is in the same block as byte (length). 1340 * Thus, this rdip() invocation should always succeed 1341 * except in the face of i/o errors, and give us the 1342 * block we care about. 1343 * 1344 * rdip() makes the same locking assertions and 1345 * assumptions as we do. We do not acquire any locks 1346 * before calling it, so we have not changed the locking 1347 * situation. Finally, there do not appear to be any 1348 * paths whereby rdip() ends up invoking us again. 1349 * Thus, infinite recursion is avoided. 
1350 */ 1351 { 1352 uio_t uio; 1353 iovec_t iov[1]; 1354 char buffer; 1355 1356 uio.uio_iov = iov; 1357 uio.uio_iovcnt = 1; 1358 uio.uio_loffset = length - 1; 1359 uio.uio_resid = 1; 1360 uio.uio_segflg = UIO_SYSSPACE; 1361 uio.uio_extflg = UIO_COPY_CACHED; 1362 1363 iov[0].iov_base = &buffer; 1364 iov[0].iov_len = 1; 1365 1366 err = rdip(oip, &uio, UIO_READ, NULL); 1367 if (err) 1368 return (err); 1369 } 1370 1371 bsize = (int)lblkno(fs, length - 1) >= NDADDR ? 1372 fs->fs_bsize : fragroundup(fs, boff); 1373 pvn_vpzero(ITOV(oip), length, (size_t)(bsize - boff)); 1374 /* 1375 * Ensure full fs block is marked as dirty. 1376 */ 1377 (void) pvn_vplist_dirty(ITOV(oip), length + (bsize - boff), 1378 ufs_putapage, B_INVAL | B_TRUNC, CRED()); 1379 } 1380 1381 /* 1382 * Calculate index into inode's block list of 1383 * last direct and indirect blocks (if any) 1384 * which we want to keep. Lastblock is -1 when 1385 * the file is truncated to 0. 1386 */ 1387 lastblock = lblkno(fs, length + fs->fs_bsize - 1) - 1; 1388 lastiblock[SINGLE] = lastblock - NDADDR; 1389 lastiblock[DOUBLE] = lastiblock[SINGLE] - NINDIR(fs); 1390 lastiblock[TRIPLE] = lastiblock[DOUBLE] - NINDIR(fs) * NINDIR(fs); 1391 nblocks = btodb(fs->fs_bsize); 1392 1393 /* 1394 * Update file and block pointers 1395 * on disk before we start freeing blocks. 1396 * If we crash before free'ing blocks below, 1397 * the blocks will be returned to the free list. 1398 * lastiblock values are also normalized to -1 1399 * for calls to indirtrunc below. 
1400 */ 1401 tip = *oip; /* structure copy */ 1402 ip = &tip; 1403 1404 for (level = TRIPLE; level >= SINGLE; level--) 1405 if (lastiblock[level] < 0) { 1406 oip->i_ib[level] = 0; 1407 lastiblock[level] = -1; 1408 } 1409 for (i = NDADDR - 1; i > lastblock; i--) { 1410 oip->i_db[i] = 0; 1411 flags |= I_CHEAP; 1412 } 1413 oip->i_size = length; 1414 oip->i_flag |= ICHG|IUPD|IATTCHG; 1415 oip->i_seq++; 1416 if (!TRANS_ISTRANS(ufsvfsp)) 1417 ufs_iupdat(oip, I_SYNC); /* do sync inode update */ 1418 1419 /* 1420 * Indirect blocks first. 1421 */ 1422 for (level = TRIPLE; level >= SINGLE; level--) { 1423 bn = ip->i_ib[level]; 1424 if (bn != 0) { 1425 blocksreleased += 1426 indirtrunc(ip, bn, lastiblock[level], level, flags); 1427 if (lastiblock[level] < 0) { 1428 ip->i_ib[level] = 0; 1429 free(ip, bn, (off_t)fs->fs_bsize, 1430 flags | I_IBLK); 1431 blocksreleased += nblocks; 1432 } 1433 } 1434 if (lastiblock[level] >= 0) 1435 goto done; 1436 } 1437 1438 /* 1439 * All whole direct blocks or frags. 1440 */ 1441 for (i = NDADDR - 1; i > lastblock; i--) { 1442 bn = ip->i_db[i]; 1443 if (bn == 0) 1444 continue; 1445 ip->i_db[i] = 0; 1446 bsize = (off_t)blksize(fs, ip, i); 1447 free(ip, bn, bsize, flags); 1448 blocksreleased += btodb(bsize); 1449 } 1450 if (lastblock < 0) 1451 goto done; 1452 1453 /* 1454 * Finally, look for a change in size of the 1455 * last direct block; release any frags. 1456 */ 1457 bn = ip->i_db[lastblock]; 1458 if (bn != 0) { 1459 off_t oldspace, newspace; 1460 1461 /* 1462 * Calculate amount of space we're giving 1463 * back as old block size minus new block size. 
1464 */ 1465 oldspace = blksize(fs, ip, lastblock); 1466 UFS_SET_ISIZE(length, ip); 1467 newspace = blksize(fs, ip, lastblock); 1468 if (newspace == 0) { 1469 err = ufs_fault(ITOV(ip), "ufs_itrunc: newspace == 0"); 1470 return (err); 1471 } 1472 if (oldspace - newspace > 0) { 1473 /* 1474 * Block number of space to be free'd is 1475 * the old block # plus the number of frags 1476 * required for the storage we're keeping. 1477 */ 1478 bn += numfrags(fs, newspace); 1479 free(ip, bn, oldspace - newspace, flags); 1480 blocksreleased += btodb(oldspace - newspace); 1481 } 1482 } 1483 done: 1484 /* BEGIN PARANOIA */ 1485 for (level = SINGLE; level <= TRIPLE; level++) 1486 if (ip->i_ib[level] != oip->i_ib[level]) { 1487 err = ufs_fault(ITOV(ip), "ufs_itrunc: indirect block"); 1488 return (err); 1489 } 1490 1491 for (i = 0; i < NDADDR; i++) 1492 if (ip->i_db[i] != oip->i_db[i]) { 1493 err = ufs_fault(ITOV(ip), "ufs_itrunc: direct block"); 1494 return (err); 1495 } 1496 /* END PARANOIA */ 1497 oip->i_blocks -= blocksreleased; 1498 1499 if (oip->i_blocks < 0) { /* sanity */ 1500 cmn_err(CE_NOTE, 1501 "ufs_itrunc: %s/%d new size = %lld, blocks = %d\n", 1502 fs->fs_fsmnt, (int)oip->i_number, oip->i_size, 1503 (int)oip->i_blocks); 1504 oip->i_blocks = 0; 1505 } 1506 oip->i_flag |= ICHG|IATTCHG; 1507 oip->i_seq++; 1508 /* blocksreleased is >= zero, so this can not fail */ 1509 (void) chkdq(oip, -blocksreleased, 0, cr, (char **)NULL, 1510 (size_t *)NULL); 1511 return (0); 1512 } 1513 1514 /* 1515 * Check mode permission on inode. Mode is READ, WRITE or EXEC. 1516 * In the case of WRITE, the read-only status of the file system 1517 * is checked. Depending on the calling user, the appropriate 1518 * mode bits are selected; privileges to override missing permission 1519 * bits are checked through secpolicy_vnode_access(). 1520 * The i_contens lock must be held as reader here to prevent racing with 1521 * the acl subsystem removing/setting/changing acls on this inode. 
1522 * The caller is responsible for indicating whether or not the i_contents 1523 * lock needs to be acquired here or if already held. 1524 */ 1525 int 1526 ufs_iaccess(struct inode *ip, int mode, struct cred *cr, int dolock) 1527 { 1528 int shift = 0; 1529 int ret = 0; 1530 1531 if (dolock) 1532 rw_enter(&ip->i_contents, RW_READER); 1533 ASSERT(RW_LOCK_HELD(&ip->i_contents)); 1534 1535 if (mode & IWRITE) { 1536 /* 1537 * Disallow write attempts on read-only 1538 * file systems, unless the file is a block 1539 * or character device or a FIFO. 1540 */ 1541 if (ip->i_fs->fs_ronly != 0) { 1542 if ((ip->i_mode & IFMT) != IFCHR && 1543 (ip->i_mode & IFMT) != IFBLK && 1544 (ip->i_mode & IFMT) != IFIFO) { 1545 ret = EROFS; 1546 goto out; 1547 } 1548 } 1549 } 1550 /* 1551 * If there is an acl, check the acl and return. 1552 */ 1553 if (ip->i_ufs_acl && ip->i_ufs_acl->aowner) { 1554 ret = ufs_acl_access(ip, mode, cr); 1555 goto out; 1556 } 1557 1558 /* 1559 * Access check is based on only one of owner, group, public. 1560 * If not owner, then check group. 1561 * If not a member of the group, then check public access. 1562 */ 1563 if (crgetuid(cr) != ip->i_uid) { 1564 shift += 3; 1565 if (!groupmember((uid_t)ip->i_gid, cr)) 1566 shift += 3; 1567 } 1568 1569 /* test missing privilege bits */ 1570 ret = secpolicy_vnode_access2(cr, ITOV(ip), ip->i_uid, 1571 ip->i_mode << shift, mode); 1572 out: 1573 if (dolock) 1574 rw_exit(&ip->i_contents); 1575 return (ret); 1576 } 1577 1578 /* 1579 * if necessary, remove an inode from the free list 1580 * i_contents is held except at unmount 1581 * 1582 * Return 1 if the inode is taken off of the ufs_idle_q, 1583 * and the caller is expected to call VN_RELE. 1584 * 1585 * Return 0 otherwise. 
1586 */ 1587 int 1588 ufs_rmidle(struct inode *ip) 1589 { 1590 int rval = 0; 1591 1592 mutex_enter(&ip->i_tlock); 1593 if ((ip->i_flag & IREF) == 0) { 1594 mutex_enter(&ufs_idle_q.uq_mutex); 1595 ip->i_freef->i_freeb = ip->i_freeb; 1596 ip->i_freeb->i_freef = ip->i_freef; 1597 ip->i_freef = ip; 1598 ip->i_freeb = ip; 1599 ip->i_flag |= IREF; 1600 ufs_idle_q.uq_ne--; 1601 if (ip->i_flag & IJUNKIQ) { 1602 ufs_njunk_iq--; 1603 ip->i_flag &= ~IJUNKIQ; 1604 } else { 1605 ufs_nuseful_iq--; 1606 } 1607 mutex_exit(&ufs_idle_q.uq_mutex); 1608 rval = 1; 1609 } 1610 mutex_exit(&ip->i_tlock); 1611 return (rval); 1612 } 1613 1614 /* 1615 * scan the hash of inodes and call func with the inode locked 1616 */ 1617 int 1618 ufs_scan_inodes(int rwtry, int (*func)(struct inode *, void *), void *arg, 1619 struct ufsvfs *ufsvfsp) 1620 { 1621 struct inode *ip; /* current inode */ 1622 struct inode *lip = NULL; /* last/previous inode */ 1623 union ihead *ih; /* current hash chain */ 1624 int error, i; 1625 int saverror = 0; 1626 int lip_held; /* lip needs a VN_RELE() */ 1627 1628 /* 1629 * If ufsvfsp is NULL, then our caller should be holding 1630 * ufs_scan_lock to avoid conflicts between ufs_unmount() and 1631 * ufs_update(). Otherwise, to avoid false-positives in 1632 * ufs_unmount()'s v_count-based EBUSY check, we only hold 1633 * those inodes that are in the file system our caller cares 1634 * about. 1635 * 1636 * We know that ip is a valid inode in the hash chain (and thus 1637 * we can trust i_ufsvfs) because the inode we chained from 1638 * (lip) is still in the hash chain. This is true because either: 1639 * 1640 * 1. We did not drop the hash chain lock since the last 1641 * iteration (because we were not interested in the last inode), 1642 * or 1643 * 2. We maintained a hold on the last inode while we 1644 * we were processing it, so it could not be removed 1645 * from the hash chain. 
1646 * 1647 * The whole reason we're dropping and re-grabbing the chain 1648 * lock on every inode is so that we don't present a major 1649 * choke point on throughput, particularly when we've been 1650 * called on behalf of fsflush. 1651 */ 1652 1653 for (i = 0, ih = ihead; i < inohsz; i++, ih++) { 1654 mutex_enter(&ih_lock[i]); 1655 for (ip = ih->ih_chain[0], lip_held = 0; 1656 ip != (struct inode *)ih; 1657 ip = lip->i_forw) { 1658 1659 ins.in_scan.value.ul++; 1660 1661 /* 1662 * Undo the previous iteration's VN_HOLD(), but 1663 * only if one was done. 1664 */ 1665 if (lip_held) 1666 VN_RELE(ITOV(lip)); 1667 1668 lip = ip; 1669 if (ufsvfsp != NULL && ip->i_ufsvfs != ufsvfsp) { 1670 /* 1671 * We're not processing all inodes, and 1672 * this inode is not in the filesystem of 1673 * interest, so skip it. No need to do a 1674 * VN_HOLD() since we're not dropping the 1675 * hash chain lock until after we've 1676 * done the i_forw traversal above. 1677 */ 1678 lip_held = 0; 1679 continue; 1680 } 1681 VN_HOLD(ITOV(ip)); 1682 lip_held = 1; 1683 mutex_exit(&ih_lock[i]); 1684 1685 /* 1686 * Acquire the contents lock as writer to make 1687 * sure that the inode has been initialized in 1688 * the cache or removed from the idle list by 1689 * ufs_iget(). This works because ufs_iget() 1690 * acquires the contents lock before putting 1691 * the inode into the cache. If we can lock 1692 * it, then ufs_iget() is done with it. 1693 */ 1694 1695 if (rwtry) { 1696 if (!rw_tryenter(&ip->i_contents, RW_WRITER)) { 1697 mutex_enter(&ih_lock[i]); 1698 continue; 1699 } 1700 } else { 1701 rw_enter(&ip->i_contents, RW_WRITER); 1702 } 1703 1704 rw_exit(&ip->i_contents); 1705 1706 /* 1707 * ISTALE means the inode couldn't be read 1708 * 1709 * We don't have to hold the i_contents lock 1710 * for this check for a couple of 1711 * reasons. 
First, if ISTALE is set then the 1712 * flag cannot be cleared until the inode is 1713 * removed from the cache and that cannot 1714 * happen until after we VN_RELE() it. 1715 * Second, if ISTALE is not set, then the 1716 * inode is in the cache and does not need to 1717 * be read from disk so ISTALE cannot be set 1718 * while we are not looking. 1719 */ 1720 if ((ip->i_flag & ISTALE) == 0) { 1721 if ((error = (*func)(ip, arg)) != 0) 1722 saverror = error; 1723 } 1724 1725 mutex_enter(&ih_lock[i]); 1726 } 1727 if (lip_held) 1728 VN_RELE(ITOV(lip)); 1729 mutex_exit(&ih_lock[i]); 1730 } 1731 return (saverror); 1732 } 1733 1734 /* 1735 * Mark inode with the current time, plus a unique increment. 1736 * 1737 * Since we only keep 32-bit time on disk, if UFS is still alive 1738 * beyond 2038, filesystem times will simply stick at the last 1739 * possible second of 32-bit time. Not ideal, but probably better 1740 * than going into the remote past, or confusing applications with 1741 * negative time. 1742 */ 1743 void 1744 ufs_imark(struct inode *ip) 1745 { 1746 timestruc_t now; 1747 int32_t usec, nsec; 1748 1749 /* 1750 * The update of i_seq may have been deferred, increase i_seq here 1751 * to make sure it is in sync with the timestamps. 1752 */ 1753 if (ip->i_flag & ISEQ) { 1754 ASSERT(ip->i_flag & (IUPD|ICHG)); 1755 ip->i_seq++; 1756 ip->i_flag &= ~ISEQ; 1757 } 1758 1759 gethrestime(&now); 1760 1761 /* 1762 * Fast algorithm to convert nsec to usec -- see hrt2ts() 1763 * in common/os/timers.c for a full description. 
1764 */ 1765 nsec = now.tv_nsec; 1766 usec = nsec + (nsec >> 2); 1767 usec = nsec + (usec >> 1); 1768 usec = nsec + (usec >> 2); 1769 usec = nsec + (usec >> 4); 1770 usec = nsec - (usec >> 3); 1771 usec = nsec + (usec >> 2); 1772 usec = nsec + (usec >> 3); 1773 usec = nsec + (usec >> 4); 1774 usec = nsec + (usec >> 1); 1775 usec = nsec + (usec >> 6); 1776 usec = usec >> 10; 1777 1778 mutex_enter(&ufs_iuniqtime_lock); 1779 if (now.tv_sec > (time_t)iuniqtime.tv_sec || 1780 usec > iuniqtime.tv_usec) { 1781 if (now.tv_sec < TIME32_MAX) { 1782 iuniqtime.tv_sec = (time32_t)now.tv_sec; 1783 iuniqtime.tv_usec = usec; 1784 } 1785 } else { 1786 if (iuniqtime.tv_sec < TIME32_MAX) { 1787 iuniqtime.tv_usec++; 1788 /* Check for usec overflow */ 1789 if (iuniqtime.tv_usec >= MICROSEC) { 1790 iuniqtime.tv_sec++; 1791 iuniqtime.tv_usec = 0; 1792 } 1793 } 1794 } 1795 1796 if ((ip->i_flag & IACC) && !(ip->i_ufsvfs->vfs_noatime)) { 1797 ip->i_atime = iuniqtime; 1798 } 1799 if (ip->i_flag & IUPD) { 1800 ip->i_mtime = iuniqtime; 1801 ip->i_flag |= IMODTIME; 1802 } 1803 if (ip->i_flag & ICHG) { 1804 ip->i_diroff = 0; 1805 ip->i_ctime = iuniqtime; 1806 } 1807 mutex_exit(&ufs_iuniqtime_lock); 1808 } 1809 1810 /* 1811 * Update timestamps in inode. 1812 */ 1813 void 1814 ufs_itimes_nolock(struct inode *ip) 1815 { 1816 1817 /* 1818 * if noatime is set and the inode access time is the only field that 1819 * must be changed, exit immediately. 1820 */ 1821 if (((ip->i_flag & (IUPD|IACC|ICHG)) == IACC) && 1822 (ip->i_ufsvfs->vfs_noatime)) { 1823 return; 1824 } 1825 1826 if (ip->i_flag & (IUPD|IACC|ICHG)) { 1827 if (ip->i_flag & ICHG) 1828 ip->i_flag |= IMOD; 1829 else 1830 ip->i_flag |= IMODACC; 1831 ufs_imark(ip); 1832 ip->i_flag &= ~(IACC|IUPD|ICHG); 1833 } 1834 } 1835