1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ 27 /* All Rights Reserved */ 28 29 /* 30 * University Copyright- Copyright (c) 1982, 1986, 1988 31 * The Regents of the University of California 32 * All Rights Reserved 33 * 34 * University Acknowledgment- Portions of this document are derived from 35 * software developed by the University of California, Berkeley, and its 36 * contributors. 
37 */ 38 39 40 #pragma ident "%Z%%M% %I% %E% SMI" 41 42 #include <sys/types.h> 43 #include <sys/t_lock.h> 44 #include <sys/param.h> 45 #include <sys/systm.h> 46 #include <sys/uio.h> 47 #include <sys/bitmap.h> 48 #include <sys/signal.h> 49 #include <sys/cred.h> 50 #include <sys/user.h> 51 #include <sys/vfs.h> 52 #include <sys/stat.h> 53 #include <sys/vnode.h> 54 #include <sys/buf.h> 55 #include <sys/proc.h> 56 #include <sys/disp.h> 57 #include <sys/dnlc.h> 58 #include <sys/mode.h> 59 #include <sys/cmn_err.h> 60 #include <sys/kstat.h> 61 #include <sys/acl.h> 62 #include <sys/var.h> 63 #include <sys/fs/ufs_inode.h> 64 #include <sys/fs/ufs_fs.h> 65 #include <sys/fs/ufs_trans.h> 66 #include <sys/fs/ufs_acl.h> 67 #include <sys/fs/ufs_bio.h> 68 #include <sys/fs/ufs_quota.h> 69 #include <sys/fs/ufs_log.h> 70 #include <vm/hat.h> 71 #include <vm/as.h> 72 #include <vm/pvn.h> 73 #include <vm/seg.h> 74 #include <sys/swap.h> 75 #include <sys/cpuvar.h> 76 #include <sys/sysmacros.h> 77 #include <sys/errno.h> 78 #include <sys/kmem.h> 79 #include <sys/debug.h> 80 #include <fs/fs_subr.h> 81 #include <sys/policy.h> 82 83 struct kmem_cache *inode_cache; /* cache of free inodes */ 84 85 /* UFS Inode Cache Stats -- Not protected */ 86 struct instats ins = { 87 { "size", KSTAT_DATA_ULONG }, 88 { "maxsize", KSTAT_DATA_ULONG }, 89 { "hits", KSTAT_DATA_ULONG }, 90 { "misses", KSTAT_DATA_ULONG }, 91 { "kmem allocs", KSTAT_DATA_ULONG }, 92 { "kmem frees", KSTAT_DATA_ULONG }, 93 { "maxsize reached", KSTAT_DATA_ULONG }, 94 { "puts at frontlist", KSTAT_DATA_ULONG }, 95 { "puts at backlist", KSTAT_DATA_ULONG }, 96 { "queues to free", KSTAT_DATA_ULONG }, 97 { "scans", KSTAT_DATA_ULONG }, 98 { "thread idles", KSTAT_DATA_ULONG }, 99 { "lookup idles", KSTAT_DATA_ULONG }, 100 { "vget idles", KSTAT_DATA_ULONG }, 101 { "cache allocs", KSTAT_DATA_ULONG }, 102 { "cache frees", KSTAT_DATA_ULONG }, 103 { "pushes at close", KSTAT_DATA_ULONG } 104 }; 105 106 /* kstat data */ 107 static kstat_t *ufs_inode_kstat 
= NULL; 108 109 union ihead *ihead; /* inode LRU cache, Chris Maltby */ 110 kmutex_t *ih_lock; /* protect inode cache hash table */ 111 static int ino_hashlen = 4; /* desired average hash chain length */ 112 int inohsz; /* number of buckets in the hash table */ 113 114 kmutex_t ufs_scan_lock; /* stop racing multiple ufs_scan_inodes() */ 115 kmutex_t ufs_iuniqtime_lock; /* protect iuniqtime */ 116 kmutex_t ufsvfs_mutex; 117 struct ufsvfs *oldufsvfslist, *ufsvfslist; 118 119 /* 120 * time to wait after ufsvfsp->vfs_iotstamp before declaring that no 121 * I/Os are going on. 122 */ 123 clock_t ufs_iowait; 124 125 /* 126 * the threads that process idle inodes and free (deleted) inodes 127 * have high water marks that are set in ufsinit(). 128 * These values but can be no less then the minimum shown below 129 */ 130 int ufs_idle_max; /* # of allowable idle inodes */ 131 ulong_t ufs_inode_max; /* hard limit of allowable idle inodes */ 132 #define UFS_IDLE_MAX (16) /* min # of allowable idle inodes */ 133 134 /* 135 * Tunables for ufs write throttling. 136 * These are validated in ufs_iinit() since improper settings 137 * can lead to filesystem hangs. 
138 */ 139 #define UFS_HW_DEFAULT (16 * 1024 * 1024) 140 #define UFS_LW_DEFAULT (8 * 1024 * 1024) 141 int ufs_HW = UFS_HW_DEFAULT; 142 int ufs_LW = UFS_LW_DEFAULT; 143 144 static void ihinit(void); 145 extern int hash2ints(int, int); 146 147 static int ufs_iget_internal(struct vfs *, ino_t, struct inode **, 148 struct cred *, int); 149 150 /* ARGSUSED */ 151 static int 152 ufs_inode_kstat_update(kstat_t *ksp, int rw) 153 { 154 if (rw == KSTAT_WRITE) 155 return (EACCES); 156 157 ins.in_malloc.value.ul = (ulong_t)kmem_cache_stat(inode_cache, 158 "slab_alloc"); 159 ins.in_mfree.value.ul = (ulong_t)kmem_cache_stat(inode_cache, 160 "slab_free"); 161 ins.in_kcalloc.value.ul = (ulong_t)kmem_cache_stat(inode_cache, 162 "alloc"); 163 ins.in_kcfree.value.ul = (ulong_t)kmem_cache_stat(inode_cache, 164 "free"); 165 ins.in_size.value.ul = (ulong_t)kmem_cache_stat(inode_cache, 166 "buf_inuse"); 167 ins.in_maxreached.value.ul = (ulong_t)kmem_cache_stat(inode_cache, 168 "buf_max"); 169 ins.in_misses.value.ul = ins.in_kcalloc.value.ul; 170 171 return (0); 172 } 173 174 void 175 ufs_iinit(void) 176 { 177 /* 178 * Validate that ufs_HW > ufs_LW. 179 * The default values for these two tunables have been increased. 180 * There is now a range of values for ufs_HW that used to be 181 * legal on previous Solaris versions but no longer is now. 182 * Upgrading a machine which has an /etc/system setting for ufs_HW 183 * from that range can lead to filesystem hangs unless the values 184 * are checked here. 185 */ 186 if (ufs_HW <= ufs_LW) { 187 cmn_err(CE_WARN, 188 "ufs_HW (%d) <= ufs_LW (%d). 
Check /etc/system.", 189 ufs_HW, ufs_LW); 190 ufs_LW = UFS_LW_DEFAULT; 191 ufs_HW = UFS_HW_DEFAULT; 192 cmn_err(CE_CONT, "using defaults, ufs_HW = %d, ufs_LW = %d\n", 193 ufs_HW, ufs_LW); 194 } 195 196 /* 197 * Adjust the tunable `ufs_ninode' to a reasonable value 198 */ 199 if (ufs_ninode <= 0) 200 ufs_ninode = ncsize; 201 if (ufs_inode_max == 0) 202 ufs_inode_max = 203 (ulong_t)((kmem_maxavail() >> 2) / sizeof (struct inode)); 204 if (ufs_ninode > ufs_inode_max || (ufs_ninode == 0 && ncsize == 0)) { 205 cmn_err(CE_NOTE, "setting ufs_ninode to max value of %ld", 206 ufs_inode_max); 207 ufs_ninode = ufs_inode_max; 208 } 209 /* 210 * Wait till third call of ufs_update to declare that no I/Os are 211 * going on. This allows deferred access times to be flushed to disk. 212 */ 213 ufs_iowait = v.v_autoup * hz * 2; 214 215 /* 216 * idle thread runs when 25% of ufs_ninode entries are on the queue 217 */ 218 if (ufs_idle_max == 0) 219 ufs_idle_max = ufs_ninode >> 2; 220 if (ufs_idle_max < UFS_IDLE_MAX) 221 ufs_idle_max = UFS_IDLE_MAX; 222 if (ufs_idle_max > ufs_ninode) 223 ufs_idle_max = ufs_ninode; 224 /* 225 * This is really a misnomer, it is ufs_queue_init 226 */ 227 ufs_thread_init(&ufs_idle_q, ufs_idle_max); 228 ufs_thread_start(&ufs_idle_q, ufs_thread_idle, NULL); 229 230 /* 231 * global hlock thread 232 */ 233 ufs_thread_init(&ufs_hlock, 1); 234 ufs_thread_start(&ufs_hlock, ufs_thread_hlock, NULL); 235 236 ihinit(); 237 qtinit(); 238 ins.in_maxsize.value.ul = ufs_ninode; 239 if ((ufs_inode_kstat = kstat_create("ufs", 0, "inode_cache", "ufs", 240 KSTAT_TYPE_NAMED, sizeof (ins) / sizeof (kstat_named_t), 241 KSTAT_FLAG_VIRTUAL)) != NULL) { 242 ufs_inode_kstat->ks_data = (void *)&ins; 243 ufs_inode_kstat->ks_update = ufs_inode_kstat_update; 244 kstat_install(ufs_inode_kstat); 245 } 246 ufsfx_init(); /* fix-on-panic initialization */ 247 si_cache_init(); 248 ufs_directio_init(); 249 lufs_init(); 250 mutex_init(&ufs_iuniqtime_lock, NULL, MUTEX_DEFAULT, NULL); 251 } 252 
253 /* ARGSUSED */ 254 static int 255 ufs_inode_cache_constructor(void *buf, void *cdrarg, int kmflags) 256 { 257 struct inode *ip = buf; 258 struct vnode *vp; 259 260 rw_init(&ip->i_rwlock, NULL, RW_DEFAULT, NULL); 261 rw_init(&ip->i_contents, NULL, RW_DEFAULT, NULL); 262 mutex_init(&ip->i_tlock, NULL, MUTEX_DEFAULT, NULL); 263 dnlc_dir_init(&ip->i_danchor); 264 265 cv_init(&ip->i_wrcv, NULL, CV_DRIVER, NULL); 266 267 vp = vn_alloc(KM_SLEEP); 268 ip->i_vnode = vp; 269 270 vn_setops(vp, ufs_vnodeops); 271 vp->v_data = (caddr_t)ip; 272 273 return (0); 274 } 275 276 /* ARGSUSED */ 277 static void 278 ufs_inode_cache_destructor(void *buf, void *cdrarg) 279 { 280 struct inode *ip = buf; 281 struct vnode *vp; 282 283 vp = ITOV(ip); 284 285 rw_destroy(&ip->i_rwlock); 286 rw_destroy(&ip->i_contents); 287 288 mutex_destroy(&ip->i_tlock); 289 if (vp->v_type == VDIR) { 290 dnlc_dir_fini(&ip->i_danchor); 291 } 292 293 cv_destroy(&ip->i_wrcv); 294 295 vn_free(vp); 296 } 297 298 /* 299 * Initialize hash links for inodes 300 * and build inode free list. 
301 */ 302 void 303 ihinit(void) 304 { 305 int i; 306 union ihead *ih = ihead; 307 308 mutex_init(&ufs_scan_lock, NULL, MUTEX_DEFAULT, NULL); 309 310 inohsz = 1 << highbit(ufs_ninode / ino_hashlen); 311 ihead = kmem_zalloc(inohsz * sizeof (union ihead), KM_SLEEP); 312 ih_lock = kmem_zalloc(inohsz * sizeof (kmutex_t), KM_SLEEP); 313 314 for (i = 0, ih = ihead; i < inohsz; i++, ih++) { 315 ih->ih_head[0] = ih; 316 ih->ih_head[1] = ih; 317 mutex_init(&ih_lock[i], NULL, MUTEX_DEFAULT, NULL); 318 } 319 inode_cache = kmem_cache_create("ufs_inode_cache", 320 sizeof (struct inode), 0, ufs_inode_cache_constructor, 321 ufs_inode_cache_destructor, ufs_inode_cache_reclaim, 322 NULL, NULL, 0); 323 } 324 325 /* 326 * Free an inode structure 327 */ 328 void 329 ufs_free_inode(struct inode *ip) 330 { 331 vn_invalid(ITOV(ip)); 332 kmem_cache_free(inode_cache, ip); 333 } 334 335 /* 336 * Allocate an inode structure 337 */ 338 struct inode * 339 ufs_alloc_inode(ufsvfs_t *ufsvfsp, ino_t ino) 340 { 341 struct inode *ip; 342 vnode_t *vp; 343 344 ip = kmem_cache_alloc(inode_cache, KM_SLEEP); 345 /* 346 * at this point we have a newly allocated inode 347 */ 348 ip->i_freef = ip; 349 ip->i_freeb = ip; 350 ip->i_flag = IREF; 351 ip->i_seq = 0xFF; /* Unique initial value */ 352 ip->i_dev = ufsvfsp->vfs_dev; 353 ip->i_ufsvfs = ufsvfsp; 354 ip->i_devvp = ufsvfsp->vfs_devvp; 355 ip->i_number = ino; 356 ip->i_diroff = 0; 357 ip->i_nextr = 0; 358 ip->i_map = NULL; 359 ip->i_rdev = 0; 360 ip->i_writes = 0; 361 ip->i_mode = 0; 362 ip->i_delaylen = 0; 363 ip->i_delayoff = 0; 364 ip->i_nextrio = 0; 365 ip->i_ufs_acl = NULL; 366 ip->i_cflags = 0; 367 ip->i_mapcnt = 0; 368 ip->i_dquot = NULL; 369 ip->i_cachedir = CD_ENABLED; 370 ip->i_writer = NULL; 371 372 /* 373 * the vnode for this inode was allocated by the constructor 374 */ 375 vp = ITOV(ip); 376 vn_reinit(vp); 377 if (ino == (ino_t)UFSROOTINO) 378 vp->v_flag = VROOT; 379 vp->v_vfsp = ufsvfsp->vfs_vfs; 380 vn_exists(vp); 381 return (ip); 382 } 
383 384 /* 385 * Look up an inode by device, inumber. If it is in core (in the 386 * inode structure), honor the locking protocol. If it is not in 387 * core, read it in from the specified device after freeing any pages. 388 * In all cases, a pointer to a VN_HELD inode structure is returned. 389 */ 390 int 391 ufs_iget(struct vfs *vfsp, ino_t ino, struct inode **ipp, struct cred *cr) 392 { 393 return (ufs_iget_internal(vfsp, ino, ipp, cr, 0)); 394 } 395 396 /* 397 * A version of ufs_iget which returns only allocated, linked inodes. 398 * This is appropriate for any callers who do not expect a free inode. 399 */ 400 int 401 ufs_iget_alloced(struct vfs *vfsp, ino_t ino, struct inode **ipp, 402 struct cred *cr) 403 { 404 return (ufs_iget_internal(vfsp, ino, ipp, cr, 1)); 405 } 406 407 /* 408 * Set vnode attributes based on v_type, this should be called whenever 409 * an inode's i_mode is changed. 410 */ 411 void 412 ufs_reset_vnode(vnode_t *vp) 413 { 414 /* 415 * an old DBE hack 416 */ 417 if ((VTOI(vp)->i_mode & (ISVTX | IEXEC | IFDIR)) == ISVTX) 418 vp->v_flag |= VSWAPLIKE; 419 else 420 vp->v_flag &= ~VSWAPLIKE; 421 422 /* 423 * if not swap like and it's just a regular file, we want 424 * to maintain the vnode's pages sorted by clean/modified 425 * for faster sync'ing to disk 426 */ 427 if (vp->v_type == VREG) 428 vp->v_flag |= VMODSORT; 429 else 430 vp->v_flag &= ~VMODSORT; 431 432 /* 433 * Is this an attribute hidden dir? 434 */ 435 if ((VTOI(vp)->i_mode & IFMT) == IFATTRDIR) 436 vp->v_flag |= V_XATTRDIR; 437 else 438 vp->v_flag &= ~V_XATTRDIR; 439 } 440 441 /* 442 * Shared implementation of ufs_iget and ufs_iget_alloced. The 'validate' 443 * flag is used to distinguish the two; when true, we validate that the inode 444 * being retrieved looks like a linked and allocated inode. 
445 */ 446 /* ARGSUSED */ 447 static int 448 ufs_iget_internal(struct vfs *vfsp, ino_t ino, struct inode **ipp, 449 struct cred *cr, int validate) 450 { 451 struct inode *ip, *sp; 452 union ihead *ih; 453 kmutex_t *ihm; 454 struct buf *bp; 455 struct dinode *dp; 456 struct vnode *vp; 457 extern vfs_t EIO_vfs; 458 int error; 459 int ftype; /* XXX - Remove later on */ 460 dev_t vfs_dev; 461 struct ufsvfs *ufsvfsp; 462 struct fs *fs; 463 int hno; 464 daddr_t bno; 465 ulong_t ioff; 466 467 CPU_STATS_ADD_K(sys, ufsiget, 1); 468 469 /* 470 * Lookup inode in cache. 471 */ 472 vfs_dev = vfsp->vfs_dev; 473 hno = INOHASH(ino); 474 ih = &ihead[hno]; 475 ihm = &ih_lock[hno]; 476 477 again: 478 mutex_enter(ihm); 479 for (ip = ih->ih_chain[0]; ip != (struct inode *)ih; ip = ip->i_forw) { 480 if (ino != ip->i_number || vfs_dev != ip->i_dev || 481 (ip->i_flag & ISTALE)) 482 continue; 483 484 /* 485 * Found the interesting inode; hold it and drop the cache lock 486 */ 487 vp = ITOV(ip); /* for locknest */ 488 VN_HOLD(vp); 489 mutex_exit(ihm); 490 rw_enter(&ip->i_contents, RW_READER); 491 492 /* 493 * if necessary, remove from idle list 494 */ 495 if ((ip->i_flag & IREF) == 0) { 496 if (ufs_rmidle(ip)) 497 VN_RELE(vp); 498 } 499 500 /* 501 * Could the inode be read from disk? 502 */ 503 if (ip->i_flag & ISTALE) { 504 rw_exit(&ip->i_contents); 505 VN_RELE(vp); 506 goto again; 507 } 508 509 ins.in_hits.value.ul++; 510 *ipp = ip; 511 512 /* 513 * Reset the vnode's attribute flags 514 */ 515 mutex_enter(&vp->v_lock); 516 ufs_reset_vnode(vp); 517 mutex_exit(&vp->v_lock); 518 519 rw_exit(&ip->i_contents); 520 521 return (0); 522 } 523 mutex_exit(ihm); 524 525 /* 526 * Inode was not in cache. 
527 * 528 * Allocate a new entry 529 */ 530 ufsvfsp = (struct ufsvfs *)vfsp->vfs_data; 531 fs = ufsvfsp->vfs_fs; 532 533 ip = ufs_alloc_inode(ufsvfsp, ino); 534 vp = ITOV(ip); 535 536 bno = fsbtodb(fs, itod(fs, ino)); 537 ioff = (sizeof (struct dinode)) * (itoo(fs, ino)); 538 ip->i_doff = (offset_t)ioff + ldbtob(bno); 539 540 /* 541 * put a place holder in the cache (if not already there) 542 */ 543 mutex_enter(ihm); 544 for (sp = ih->ih_chain[0]; sp != (struct inode *)ih; sp = sp->i_forw) 545 if (ino == sp->i_number && vfs_dev == sp->i_dev && 546 ((sp->i_flag & ISTALE) == 0)) { 547 mutex_exit(ihm); 548 ufs_free_inode(ip); 549 goto again; 550 } 551 /* 552 * It would be nice to ASSERT(RW_READ_HELD(&ufsvfsp->vfs_dqrwlock)) 553 * here, but if we do, then shadow inode allocations panic the 554 * system. We don't have to hold vfs_dqrwlock for shadow inodes 555 * and the ufs_iget() parameters don't tell us what we are getting 556 * so we have no way of knowing this is a ufs_iget() call from 557 * a ufs_ialloc() call for a shadow inode. 558 */ 559 rw_enter(&ip->i_contents, RW_WRITER); 560 insque(ip, ih); 561 mutex_exit(ihm); 562 /* 563 * read the dinode 564 */ 565 bp = UFS_BREAD(ufsvfsp, ip->i_dev, bno, (int)fs->fs_bsize); 566 567 /* 568 * Check I/O errors 569 */ 570 error = ((bp->b_flags & B_ERROR) ? 
geterror(bp) : 0); 571 if (error) { 572 brelse(bp); 573 ip->i_flag |= ISTALE; /* in case someone is looking it up */ 574 rw_exit(&ip->i_contents); 575 vp->v_vfsp = &EIO_vfs; 576 VN_RELE(vp); 577 return (error); 578 } 579 /* 580 * initialize the inode's dinode 581 */ 582 dp = (struct dinode *)(ioff + bp->b_un.b_addr); 583 ip->i_ic = dp->di_ic; /* structure assignment */ 584 brelse(bp); 585 586 /* 587 * Maintain compatibility with Solaris 1.x UFS 588 */ 589 if (ip->i_suid != UID_LONG) 590 ip->i_uid = ip->i_suid; 591 if (ip->i_sgid != GID_LONG) 592 ip->i_gid = ip->i_sgid; 593 594 ftype = ip->i_mode & IFMT; 595 if (ftype == IFBLK || ftype == IFCHR) { 596 dev_t dv; 597 uint_t top16 = ip->i_ordev & 0xffff0000u; 598 599 if (top16 == 0 || top16 == 0xffff0000u) 600 dv = expdev(ip->i_ordev); 601 else 602 dv = expldev(ip->i_ordev); 603 vp->v_rdev = ip->i_rdev = dv; 604 } 605 606 /* 607 * if our caller only expects allocated inodes, verify that 608 * this inode looks good; throw it out if it's bad. 609 */ 610 if (validate) { 611 if ((ftype == 0) || (ip->i_nlink <= 0)) { 612 ip->i_flag |= ISTALE; 613 rw_exit(&ip->i_contents); 614 vp->v_vfsp = &EIO_vfs; 615 VN_RELE(vp); 616 cmn_err(CE_NOTE, 617 "%s: unexpected free inode %d, run fsck(1M)%s", 618 fs->fs_fsmnt, (int)ino, 619 (TRANS_ISTRANS(ufsvfsp) ? " -o f" : "")); 620 return (EIO); 621 } 622 } 623 624 /* 625 * finish initializing the vnode 626 */ 627 vp->v_type = IFTOVT((mode_t)ip->i_mode); 628 629 ufs_reset_vnode(vp); 630 631 /* 632 * read the shadow 633 */ 634 if (ftype != 0 && ip->i_shadow != 0) { 635 if ((error = ufs_si_load(ip, cr)) != 0) { 636 ip->i_flag |= ISTALE; 637 ip->i_ufs_acl = NULL; 638 rw_exit(&ip->i_contents); 639 vp->v_vfsp = &EIO_vfs; 640 VN_RELE(vp); 641 return (error); 642 } 643 } 644 645 /* 646 * Only attach quota information if the inode has a type and if 647 * that type is not a shadow inode. 
648 */ 649 if (ip->i_mode && ((ip->i_mode & IFMT) != IFSHAD) && 650 ((ip->i_mode & IFMT) != IFATTRDIR)) { 651 ip->i_dquot = getinoquota(ip); 652 } 653 TRANS_MATA_IGET(ufsvfsp, ip); 654 *ipp = ip; 655 rw_exit(&ip->i_contents); 656 657 return (0); 658 } 659 660 /* 661 * Vnode is no longer referenced, write the inode out 662 * and if necessary, truncate and deallocate the file. 663 */ 664 void 665 ufs_iinactive(struct inode *ip) 666 { 667 int front; 668 struct inode *iq; 669 struct inode *hip; 670 struct ufs_q *uq; 671 struct vnode *vp = ITOV(ip); 672 struct ufsvfs *ufsvfsp = ip->i_ufsvfs; 673 struct ufs_delq_info *delq_info = &ufsvfsp->vfs_delete_info; 674 675 /* 676 * Because the vnode type might have been changed, 677 * the dnlc_dir_purge must be called unconditionally. 678 */ 679 dnlc_dir_purge(&ip->i_danchor); 680 681 /* 682 * Get exclusive access to inode data. 683 */ 684 rw_enter(&ip->i_contents, RW_WRITER); 685 ASSERT(ip->i_flag & IREF); 686 687 /* 688 * Make sure no one reclaimed the inode before we put it on 689 * the freelist or destroy it. We keep our 'hold' on the vnode 690 * from vn_rele until we are ready to do something with the inode. 691 * 692 * Pageout may put a VN_HOLD/VN_RELE at anytime during this 693 * operation via an async putpage, so we must make sure 694 * we don't free/destroy the inode more than once. ufs_iget 695 * may also put a VN_HOLD on the inode before it grabs 696 * the i_contents lock. This is done so we don't free 697 * an inode that a thread is waiting on. 698 */ 699 mutex_enter(&vp->v_lock); 700 701 if (vp->v_count > 1) { 702 vp->v_count--; /* release our hold from vn_rele */ 703 mutex_exit(&vp->v_lock); 704 rw_exit(&ip->i_contents); 705 return; 706 } 707 mutex_exit(&vp->v_lock); 708 709 /* 710 * For umount case: if ufsvfs ptr is NULL, the inode is unhashed 711 * and clean. It can be safely destroyed (cyf). 
712 */ 713 if (ip->i_ufsvfs == NULL) { 714 rw_exit(&ip->i_contents); 715 ufs_si_del(ip); 716 ASSERT((vp->v_type == VCHR) || !vn_has_cached_data(vp)); 717 ufs_free_inode(ip); 718 return; 719 } 720 721 /* 722 * queue idle inode to appropriate thread. Will check v_count == 1 723 * prior to putting this on the appropriate queue. 724 * Stale inodes will be unhashed and freed by the ufs idle thread 725 * in ufs_idle_free() 726 */ 727 front = 1; 728 if ((ip->i_flag & ISTALE) == 0 && ip->i_fs->fs_ronly == 0 && 729 ip->i_mode && ip->i_nlink <= 0) { 730 /* 731 * Mark the i_flag to indicate that inode is being deleted. 732 * This flag will be cleared when the deletion is complete. 733 * This prevents nfs from sneaking in via ufs_vget() while 734 * the delete is in progress (bugid 1242481). 735 */ 736 ip->i_flag |= IDEL; 737 738 /* 739 * NOIDEL means that deletes are not allowed at this time; 740 * whoever resets NOIDEL will also send this inode back 741 * through ufs_iinactive. IREF remains set. 742 */ 743 if (ULOCKFS_IS_NOIDEL(ITOUL(ip))) { 744 mutex_enter(&vp->v_lock); 745 vp->v_count--; 746 mutex_exit(&vp->v_lock); 747 rw_exit(&ip->i_contents); 748 return; 749 } 750 if (!TRANS_ISTRANS(ip->i_ufsvfs)) { 751 rw_exit(&ip->i_contents); 752 ufs_delete(ip->i_ufsvfs, ip, 0); 753 return; 754 } 755 756 /* queue to delete thread; IREF remains set */ 757 ins.in_qfree.value.ul++; 758 uq = &ip->i_ufsvfs->vfs_delete; 759 760 mutex_enter(&uq->uq_mutex); 761 762 /* add to q */ 763 if ((iq = uq->uq_ihead) != 0) { 764 ip->i_freef = iq; 765 ip->i_freeb = iq->i_freeb; 766 iq->i_freeb->i_freef = ip; 767 iq->i_freeb = ip; 768 if (front) 769 uq->uq_ihead = ip; 770 } else { 771 uq->uq_ihead = ip; 772 ip->i_freef = ip; 773 ip->i_freeb = ip; 774 } 775 776 delq_info->delq_unreclaimed_files += 1; 777 delq_info->delq_unreclaimed_blocks += ip->i_blocks; 778 } else { 779 /* 780 * queue to idle thread 781 * Check the v_count == 1 again. 
782 * 783 */ 784 mutex_enter(&vp->v_lock); 785 if (vp->v_count > 1) { 786 vp->v_count--; /* release our hold from vn_rele */ 787 mutex_exit(&vp->v_lock); 788 rw_exit(&ip->i_contents); 789 return; 790 } 791 mutex_exit(&vp->v_lock); 792 uq = &ufs_idle_q; 793 794 /* 795 * useful iff it has pages or is a fastsymlink; otherwise junk 796 */ 797 mutex_enter(&uq->uq_mutex); 798 799 /* clear IREF means `on idle list' */ 800 ip->i_flag &= ~(IREF | IDIRECTIO); 801 802 if (vn_has_cached_data(vp) || ip->i_flag & IFASTSYMLNK) { 803 ins.in_frback.value.ul++; 804 hip = (inode_t *)&ufs_useful_iq[IQHASH(ip)]; 805 ufs_nuseful_iq++; 806 } else { 807 ins.in_frfront.value.ul++; 808 hip = (inode_t *)&ufs_junk_iq[IQHASH(ip)]; 809 ip->i_flag |= IJUNKIQ; 810 ufs_njunk_iq++; 811 } 812 ip->i_freef = hip; 813 ip->i_freeb = hip->i_freeb; 814 hip->i_freeb->i_freef = ip; 815 hip->i_freeb = ip; 816 } 817 818 /* wakeup thread(s) if q is overfull */ 819 if (++uq->uq_ne == uq->uq_lowat) 820 cv_broadcast(&uq->uq_cv); 821 822 /* all done, release the q and inode */ 823 mutex_exit(&uq->uq_mutex); 824 rw_exit(&ip->i_contents); 825 } 826 827 /* 828 * Check accessed and update flags on an inode structure. 829 * If any are on, update the inode with the (unique) current time. 830 * If waitfor is given, insure I/O order so wait for write to complete. 831 */ 832 void 833 ufs_iupdat(struct inode *ip, int waitfor) 834 { 835 struct buf *bp; 836 struct fs *fp; 837 struct dinode *dp; 838 struct ufsvfs *ufsvfsp = ip->i_ufsvfs; 839 int i; 840 int do_trans_times; 841 ushort_t flag; 842 o_uid_t suid; 843 o_gid_t sgid; 844 845 /* 846 * This function is now safe to be called with either the reader 847 * or writer i_contents lock. 848 */ 849 ASSERT(RW_LOCK_HELD(&ip->i_contents)); 850 851 /* 852 * Return if file system has been forcibly umounted. 853 */ 854 if (ufsvfsp == NULL) 855 return; 856 857 flag = ip->i_flag; /* Atomic read */ 858 /* 859 * We better not update the disk inode from a stale inode. 
860 */ 861 if (flag & ISTALE) 862 return; 863 864 fp = ip->i_fs; 865 866 if ((flag & (IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG)) != 0) { 867 if (fp->fs_ronly) { 868 mutex_enter(&ip->i_tlock); 869 ip->i_flag &= ~(IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG); 870 mutex_exit(&ip->i_tlock); 871 return; 872 } 873 /* 874 * fs is active while metadata is being written 875 */ 876 mutex_enter(&ufsvfsp->vfs_lock); 877 ufs_notclean(ufsvfsp); 878 /* 879 * get the dinode 880 */ 881 bp = UFS_BREAD(ufsvfsp, ip->i_dev, 882 (daddr_t)fsbtodb(fp, itod(fp, ip->i_number)), 883 (int)fp->fs_bsize); 884 if (bp->b_flags & B_ERROR) { 885 mutex_enter(&ip->i_tlock); 886 ip->i_flag &= 887 ~(IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG); 888 mutex_exit(&ip->i_tlock); 889 brelse(bp); 890 return; 891 } 892 /* 893 * munge inode fields 894 */ 895 mutex_enter(&ip->i_tlock); 896 ITIMES_NOLOCK(ip); 897 do_trans_times = ((ip->i_flag & (IMOD|IMODACC)) == IMODACC); 898 ip->i_flag &= ~(IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG); 899 mutex_exit(&ip->i_tlock); 900 901 /* 902 * For reads and concurrent re-writes, no deltas were 903 * entered for the access time changes - do it now. 904 */ 905 if (do_trans_times) { 906 TRANS_INODE_TIMES(ufsvfsp, ip); 907 } 908 909 /* 910 * For SunOS 5.0->5.4, these lines below read: 911 * 912 * suid = (ip->i_uid > MAXUID) ? UID_LONG : ip->i_uid; 913 * sgid = (ip->i_gid > MAXUID) ? GID_LONG : ip->i_gid; 914 * 915 * where MAXUID was set to 60002. This was incorrect - 916 * the uids should have been constrained to what fitted into 917 * a 16-bit word. 918 * 919 * This means that files from 4.x filesystems that have an 920 * i_suid field larger than 60002 will have that field 921 * changed to 65535. 922 * 923 * Security note: 4.x UFS could never create a i_suid of 924 * UID_LONG since that would've corresponded to -1. 925 */ 926 suid = (ulong_t)ip->i_uid > (ulong_t)USHRT_MAX ? 927 UID_LONG : ip->i_uid; 928 sgid = (ulong_t)ip->i_gid > (ulong_t)USHRT_MAX ? 
929 GID_LONG : ip->i_gid; 930 931 if ((ip->i_suid != suid) || (ip->i_sgid != sgid)) { 932 ip->i_suid = suid; 933 ip->i_sgid = sgid; 934 TRANS_INODE(ufsvfsp, ip); 935 } 936 937 if ((ip->i_mode & IFMT) == IFBLK || 938 (ip->i_mode & IFMT) == IFCHR) { 939 dev_t d = ip->i_rdev; 940 dev32_t dev32; 941 942 /* 943 * load first direct block only if special device 944 */ 945 if (!cmpldev(&dev32, d)) { 946 /* 947 * We panic here because there's "no way" 948 * we should have been able to create a large 949 * inode with a large dev_t. Earlier layers 950 * should've caught this. 951 */ 952 panic("ip %p: i_rdev too big", (void *)ip); 953 } 954 955 if (dev32 & ~((O_MAXMAJ << L_BITSMINOR32) | O_MAXMIN)) { 956 ip->i_ordev = dev32; /* can't use old fmt. */ 957 } else { 958 ip->i_ordev = cmpdev(d); 959 } 960 } 961 962 /* 963 * copy inode to dinode (zero fastsymlnk in dinode) 964 */ 965 dp = (struct dinode *)bp->b_un.b_addr + itoo(fp, ip->i_number); 966 dp->di_ic = ip->i_ic; /* structure assignment */ 967 if (flag & IFASTSYMLNK) { 968 for (i = 1; i < NDADDR; i++) 969 dp->di_db[i] = 0; 970 for (i = 0; i < NIADDR; i++) 971 dp->di_ib[i] = 0; 972 } 973 if (TRANS_ISTRANS(ufsvfsp)) { 974 /* 975 * Pass only a sector size buffer containing 976 * the inode, otherwise when the buffer is copied 977 * into a cached roll buffer then too much memory 978 * gets consumed if 8KB inode buffers are passed. 979 */ 980 TRANS_LOG(ufsvfsp, (caddr_t)dp, ip->i_doff, 981 sizeof (struct dinode), 982 (caddr_t)P2ALIGN((uintptr_t)dp, DEV_BSIZE), 983 DEV_BSIZE); 984 985 brelse(bp); 986 } else if (waitfor && (ip->i_ufsvfs->vfs_dio == 0)) { 987 UFS_BRWRITE(ufsvfsp, bp); 988 989 /* 990 * Synchronous write has guaranteed that inode 991 * has been written on disk so clear the flag 992 */ 993 mutex_enter(&ip->i_tlock); 994 ip->i_flag &= ~IBDWRITE; 995 mutex_exit(&ip->i_tlock); 996 } else { 997 bdrwrite(bp); 998 999 /* 1000 * This write hasn't guaranteed that inode has been 1001 * written on the disk. 
1002 * Since, all updat flags on inode are cleared, we must 1003 * remember the condition in case inode is to be updated 1004 * synchronously later (e.g.- fsync()/fdatasync()) 1005 * and inode has not been modified yet. 1006 */ 1007 mutex_enter(&ip->i_tlock); 1008 ip->i_flag |= IBDWRITE; 1009 mutex_exit(&ip->i_tlock); 1010 } 1011 } else { 1012 /* 1013 * In case previous inode update was done asynchronously 1014 * (IBDWRITE) and this inode update request wants guaranteed 1015 * (synchronous) disk update, flush the inode. 1016 */ 1017 if (waitfor && (flag & IBDWRITE)) { 1018 blkflush(ip->i_dev, 1019 (daddr_t)fsbtodb(fp, itod(fp, ip->i_number))); 1020 mutex_enter(&ip->i_tlock); 1021 ip->i_flag &= ~IBDWRITE; 1022 mutex_exit(&ip->i_tlock); 1023 } 1024 } 1025 } 1026 1027 #define SINGLE 0 /* index of single indirect block */ 1028 #define DOUBLE 1 /* index of double indirect block */ 1029 #define TRIPLE 2 /* index of triple indirect block */ 1030 1031 /* 1032 * Release blocks associated with the inode ip and 1033 * stored in the indirect block bn. Blocks are free'd 1034 * in LIFO order up to (but not including) lastbn. If 1035 * level is greater than SINGLE, the block is an indirect 1036 * block and recursive calls to indirtrunc must be used to 1037 * cleanse other indirect blocks. 1038 * 1039 * N.B.: triple indirect blocks are untested. 1040 */ 1041 static long 1042 indirtrunc(struct inode *ip, daddr_t bn, daddr_t lastbn, int level, int flags) 1043 { 1044 int i; 1045 struct buf *bp, *copy; 1046 daddr32_t *bap; 1047 struct ufsvfs *ufsvfsp = ip->i_ufsvfs; 1048 struct fs *fs = ufsvfsp->vfs_fs; 1049 daddr_t nb, last; 1050 long factor; 1051 int blocksreleased = 0, nblocks; 1052 1053 ASSERT(RW_WRITE_HELD(&ip->i_contents)); 1054 /* 1055 * Calculate index in current block of last 1056 * block to be kept. -1 indicates the entire 1057 * block so we need not calculate the index. 
1058 */ 1059 factor = 1; 1060 for (i = SINGLE; i < level; i++) 1061 factor *= NINDIR(fs); 1062 last = lastbn; 1063 if (lastbn > 0) 1064 last /= factor; 1065 nblocks = btodb(fs->fs_bsize); 1066 /* 1067 * Get buffer of block pointers, zero those 1068 * entries corresponding to blocks to be free'd, 1069 * and update on disk copy first. 1070 * *Unless* the root pointer has been synchronously 1071 * written to disk. If nothing points to this 1072 * indirect block then don't bother zero'ing and 1073 * writing it. 1074 */ 1075 bp = UFS_BREAD(ufsvfsp, 1076 ip->i_dev, (daddr_t)fsbtodb(fs, bn), (int)fs->fs_bsize); 1077 if (bp->b_flags & B_ERROR) { 1078 brelse(bp); 1079 return (0); 1080 } 1081 bap = bp->b_un.b_daddr; 1082 if ((flags & I_CHEAP) == 0) { 1083 uint_t zb; 1084 1085 zb = (uint_t)((NINDIR(fs) - (last + 1)) * sizeof (daddr32_t)); 1086 1087 if (zb) { 1088 /* 1089 * push any data into the log before we zero it 1090 */ 1091 if (bp->b_flags & B_DELWRI) 1092 TRANS_LOG(ufsvfsp, (caddr_t)bap, 1093 ldbtob(bp->b_blkno), bp->b_bcount, 1094 bp->b_un.b_addr, bp->b_bcount); 1095 copy = ngeteblk(fs->fs_bsize); 1096 bcopy((caddr_t)bap, (caddr_t)copy->b_un.b_daddr, 1097 (uint_t)fs->fs_bsize); 1098 bzero((caddr_t)&bap[last + 1], zb); 1099 1100 TRANS_BUF(ufsvfsp, 1101 (caddr_t)&bap[last + 1] - (caddr_t)bap, 1102 zb, bp, DT_ABZERO); 1103 1104 UFS_BRWRITE(ufsvfsp, bp); 1105 bp = copy, bap = bp->b_un.b_daddr; 1106 } 1107 } else { 1108 /* make sure write retries are also cleared */ 1109 bp->b_flags &= ~(B_DELWRI | B_RETRYWRI); 1110 bp->b_flags |= B_STALE | B_AGE; 1111 } 1112 1113 /* 1114 * Recursively free totally unused blocks. 
/*
 * Generator state for the pseudo-random amount that ufs_itrunc() adds
 * to an inode's generation number (i_gen) when it is called with I_FREE.
 */
static int i_genrand = 1234;

/*
 * Set the size of inode oip to length, allocating or freeing disk
 * blocks as needed.  When shrinking, the blocks of the file are
 * released in reverse order: indirect blocks first, then whole direct
 * blocks, then any surplus fragments of the last block that is kept.
 *
 *	oip	inode to resize; i_contents must be held as writer and,
 *		unless it is a shadow inode, vfs_dqrwlock must be held
 *		(both are ASSERTed below)
 *	length	new size of the file in bytes
 *	flags	I_* flags; I_FREE additionally advances i_gen, and
 *		I_CHEAP, I_DIR, I_SHAD and I_QUOTA are ORed in here
 *		before the flags are handed to indirtrunc() and free()
 *	cr	credentials passed on to BMAPALLOC() and chkdq()
 *
 * Returns 0 on success (including the no-op cases: FIFOs, and a length
 * equal to the current size), EINVAL for inode types that cannot be
 * resized here, EFBIG when length exceeds the maximum offset of the
 * file system, or the error returned by BMAPALLOC(), rdip() or
 * ufs_fault().
 *
 * N.B.: triple indirect blocks are untested.
 */
int
ufs_itrunc(struct inode *oip, u_offset_t length, int flags, cred_t *cr)
{
	struct fs *fs = oip->i_fs;
	struct ufsvfs *ufsvfsp = oip->i_ufsvfs;
	struct inode *ip;		/* points at tip, the working copy */
	daddr_t lastblock;		/* last logical block to keep, or -1 */
	off_t bsize;
	int boff;			/* offset of length within its block */
	daddr_t bn, lastiblock[NIADDR];
	int level;
	long nblocks, blocksreleased = 0;
	int i;
	ushort_t mode;
	struct inode tip;		/* snapshot of oip with old pointers */
	int err;
	/* largest size permitted, depending on large file support */
	u_offset_t maxoffset = (ufsvfsp->vfs_lfflags & UFS_LARGEFILES) ?
	    (UFS_MAXOFFSET_T) : (MAXOFF32_T);

	/*
	 * Shadow inodes do not need to hold the vfs_dqrwlock lock.  Most
	 * other uses need the reader lock.  opendq() holds the writer lock.
	 */
	ASSERT((oip->i_mode & IFMT) == IFSHAD ||
	    RW_LOCK_HELD(&ufsvfsp->vfs_dqrwlock));
	ASSERT(RW_WRITE_HELD(&oip->i_contents));
	/*
	 * We only allow truncation of regular files and directories
	 * to arbitrary lengths here.  In addition, we allow symbolic
	 * links to be truncated only to zero length.  Other inode
	 * types cannot have their length set here.  Disk blocks are
	 * being dealt with - especially device inodes where
	 * ip->i_ordev is actually being stored in ip->i_db[0]!
	 */
	TRANS_INODE(ufsvfsp, oip);
	mode = oip->i_mode & IFMT;
	if (flags & I_FREE) {
		/*
		 * The inode is being freed: advance its generation number
		 * by a pseudo-random, non-zero amount and mark it changed.
		 */
		i_genrand *= 16843009;	/* turns into shift and adds */
		i_genrand++;
		oip->i_gen += ((i_genrand + lbolt) & 0xffff) + 1;
		oip->i_flag |= ICHG |IUPD;
		oip->i_seq++;
		if (length == oip->i_size)
			return (0);
		flags |= I_CHEAP;
	}
	/* FIFOs are silently left alone */
	if (mode == IFIFO)
		return (0);
	if (mode != IFREG && mode != IFDIR && mode != IFATTRDIR &&
	    !(mode == IFLNK && length == (offset_t)0) && mode != IFSHAD)
		return (EINVAL);
	if (length > maxoffset)
		return (EFBIG);
	/* tell the lower layers what kind of inode they are working on */
	if ((mode == IFDIR) || (mode == IFATTRDIR))
		flags |= I_DIR;
	if (mode == IFSHAD)
		flags |= I_SHAD;
	if (oip == ufsvfsp->vfs_qinod)
		flags |= I_QUOTA;
	if (length == oip->i_size) {
		/* update ctime and mtime to please POSIX tests */
		oip->i_flag |= ICHG |IUPD;
		oip->i_seq++;
		if (length == 0) {
			/* nothing to cache so clear the flag */
			oip->i_flag &= ~IFASTSYMLNK;
		}
		return (0);
	}
	/* wipe out fast symlink till next access */
	if (oip->i_flag & IFASTSYMLNK) {
		int j;

		ASSERT(ITOV(oip)->v_type == VLNK);

		oip->i_flag &= ~IFASTSYMLNK;

		/* i_db[0] is deliberately left untouched (j starts at 1) */
		for (j = 1; j < NDADDR; j++)
			oip->i_db[j] = 0;
		for (j = 0; j < NIADDR; j++)
			oip->i_ib[j] = 0;
	}

	boff = (int)blkoff(fs, length);

	if (length > oip->i_size) {
		/*
		 * Trunc up case.  BMAPALLOC will ensure that the right blocks
		 * are allocated.  This includes extending the old frag to a
		 * full block (if needed) in addition to doing any work
		 * needed for allocating the last block.
		 */
		if (boff == 0)
			err = BMAPALLOC(oip, length - 1, (int)fs->fs_bsize, cr);
		else
			err = BMAPALLOC(oip, length - 1, boff, cr);

		if (err == 0) {
			/*
			 * Save old size and set inode's size now
			 * so that we don't cause too much of the
			 * file to be zero'd and pushed.
			 */
			u_offset_t osize = oip->i_size;
			oip->i_size  = length;
			/*
			 * Make sure we zero out the remaining bytes of
			 * the page in case a mmap scribbled on it.  We
			 * can't prevent a mmap from writing beyond EOF
			 * on the last page of a file.
			 */
			if ((boff = (int)blkoff(fs, osize)) != 0) {
				/*
				 * A block reached through an indirect block
				 * is always a full block; a direct block may
				 * be a run of fragments.
				 */
				bsize = (int)lblkno(fs, osize - 1) >= NDADDR ?
				    fs->fs_bsize : fragroundup(fs, boff);
				pvn_vpzero(ITOV(oip), osize,
				    (size_t)(bsize - boff));
			}
			oip->i_flag |= ICHG|IATTCHG;
			oip->i_seq++;
			ITIMES_NOLOCK(oip);
			/*
			 * MAXOFF32_T is old 2GB size limit.  If
			 * this operation caused a large file to be
			 * created, turn on the superblock flag
			 * and update the superblock, if the flag
			 * is not already on.
			 */
			if ((length > (u_offset_t)MAXOFF32_T) &&
			    !(fs->fs_flags & FSLARGEFILES)) {
				ASSERT(ufsvfsp->vfs_lfflags & UFS_LARGEFILES);
				mutex_enter(&ufsvfsp->vfs_lock);
				fs->fs_flags |= FSLARGEFILES;
				ufs_sbwrite(ufsvfsp);
				mutex_exit(&ufsvfsp->vfs_lock);
			}
		}

		return (err);
	}

	/*
	 * Everything below is the shrinking case.
	 *
	 * Update the pages of the file.  If the file is not being
	 * truncated to a block boundary, the contents of the
	 * pages following the end of the file must be zero'ed
	 * in case they ever become accessible again because
	 * of subsequent file growth.
	 */
	if (boff == 0) {
		(void) pvn_vplist_dirty(ITOV(oip), length, ufs_putapage,
		    B_INVAL | B_TRUNC, CRED());
	} else {
		/*
		 * Make sure that the last block is properly allocated.
		 * We only really have to do this if the last block is
		 * actually allocated since ufs_bmap will now handle the case
		 * of a fragment which has no block allocated.  Just to
		 * be sure, we do it now independent of current allocation.
		 */
		err = BMAPALLOC(oip, length - 1, boff, cr);
		if (err)
			return (err);

		/*
		 * BMAPALLOC will call bmap_write which defers i_seq
		 * processing.  If the timestamps were changed, update
		 * i_seq before rdip drops i_contents or syncs the inode.
		 */
		if (oip->i_flag & (ICHG|IUPD))
			oip->i_seq++;

		/*
		 * BugId 4069932
		 * Make sure that the relevant partial page appears in
		 * the v_pages list, so that pvn_vpzero() will do its
		 * job.  Since doing this correctly requires everything
		 * in rdip() except for the uiomove(), it's easier and
		 * safer to do the uiomove() rather than duplicate the
		 * rest of rdip() here.
		 *
		 * To get here, we know that length indicates a byte
		 * that is not the first byte of a block.  (length - 1)
		 * is the last actual byte known to exist.  Deduction
		 * shows it is in the same block as byte (length).
		 * Thus, this rdip() invocation should always succeed
		 * except in the face of i/o errors, and give us the
		 * block we care about.
		 *
		 * rdip() makes the same locking assertions and
		 * assumptions as we do.  We do not acquire any locks
		 * before calling it, so we have not changed the locking
		 * situation.  Finally, there do not appear to be any
		 * paths whereby rdip() ends up invoking us again.
		 * Thus, infinite recursion is avoided.
		 */
		{
			uio_t uio;
			iovec_t iov[1];
			char buffer;	/* receives the single byte read */

			/* read exactly one byte, at offset (length - 1) */
			uio.uio_iov = iov;
			uio.uio_iovcnt = 1;
			uio.uio_loffset = length - 1;
			uio.uio_resid = 1;
			uio.uio_segflg = UIO_SYSSPACE;
			uio.uio_extflg = UIO_COPY_CACHED;

			iov[0].iov_base = &buffer;
			iov[0].iov_len = 1;

			err = rdip(oip, &uio, UIO_READ, NULL);
			if (err)
				return (err);
		}

		/* zero from the new EOF to the end of its block or frag run */
		bsize = (int)lblkno(fs, length - 1) >= NDADDR ?
		    fs->fs_bsize : fragroundup(fs, boff);
		pvn_vpzero(ITOV(oip), length, (size_t)(bsize - boff));
		/*
		 * Ensure full fs block is marked as dirty.
		 */
		(void) pvn_vplist_dirty(ITOV(oip), length + (bsize - boff),
		    ufs_putapage, B_INVAL | B_TRUNC, CRED());
	}

	/*
	 * Calculate index into inode's block list of
	 * last direct and indirect blocks (if any)
	 * which we want to keep.  Lastblock is -1 when
	 * the file is truncated to 0.
	 */
	lastblock = lblkno(fs, length + fs->fs_bsize - 1) - 1;
	lastiblock[SINGLE] = lastblock - NDADDR;
	lastiblock[DOUBLE] = lastiblock[SINGLE] - NINDIR(fs);
	lastiblock[TRIPLE] = lastiblock[DOUBLE] - NINDIR(fs) * NINDIR(fs);
	nblocks = btodb(fs->fs_bsize);

	/*
	 * Update file and block pointers
	 * on disk before we start freeing blocks.
	 * If we crash before free'ing blocks below,
	 * the blocks will be returned to the free list.
	 * lastiblock values are also normalized to -1
	 * for calls to indirtrunc below.
	 *
	 * tip keeps the old block pointers; the freeing below walks
	 * the copy (through ip) while oip already has them cleared.
	 */
	tip = *oip;			/* structure copy */
	ip = &tip;

	for (level = TRIPLE; level >= SINGLE; level--)
		if (lastiblock[level] < 0) {
			oip->i_ib[level] = 0;
			lastiblock[level] = -1;
		}
	for (i = NDADDR - 1; i > lastblock; i--) {
		oip->i_db[i] = 0;
		flags |= I_CHEAP;
	}
	oip->i_size = length;
	oip->i_flag |= ICHG|IUPD|IATTCHG;
	oip->i_seq++;
	if (!TRANS_ISTRANS(ufsvfsp))
		ufs_iupdat(oip, I_SYNC);	/* do sync inode update */

	/*
	 * Indirect blocks first.
	 */
	for (level = TRIPLE; level >= SINGLE; level--) {
		bn = ip->i_ib[level];
		if (bn != 0) {
			blocksreleased +=
			    indirtrunc(ip, bn, lastiblock[level], level, flags);
			if (lastiblock[level] < 0) {
				/* nothing kept at this level: free the root */
				ip->i_ib[level] = 0;
				free(ip, bn, (off_t)fs->fs_bsize,
				    flags | I_IBLK);
				blocksreleased += nblocks;
			}
		}
		/* part of this level is kept, so all lower blocks are too */
		if (lastiblock[level] >= 0)
			goto done;
	}

	/*
	 * All whole direct blocks or frags.
	 */
	for (i = NDADDR - 1; i > lastblock; i--) {
		bn = ip->i_db[i];
		if (bn == 0)
			continue;
		ip->i_db[i] = 0;
		bsize = (off_t)blksize(fs, ip, i);
		free(ip, bn, bsize, flags);
		blocksreleased += btodb(bsize);
	}
	if (lastblock < 0)
		goto done;

	/*
	 * Finally, look for a change in size of the
	 * last direct block; release any frags.
	 */
	bn = ip->i_db[lastblock];
	if (bn != 0) {
		off_t oldspace, newspace;

		/*
		 * Calculate amount of space we're giving
		 * back as old block size minus new block size.
		 * blksize() is evaluated before and after the size
		 * of the working copy is set to length.
		 */
		oldspace = blksize(fs, ip, lastblock);
		UFS_SET_ISIZE(length, ip);
		newspace = blksize(fs, ip, lastblock);
		if (newspace == 0) {
			err = ufs_fault(ITOV(ip), "ufs_itrunc: newspace == 0");
			return (err);
		}
		if (oldspace - newspace > 0) {
			/*
			 * Block number of space to be free'd is
			 * the old block # plus the number of frags
			 * required for the storage we're keeping.
			 */
			bn += numfrags(fs, newspace);
			free(ip, bn, oldspace - newspace, flags);
			blocksreleased += btodb(oldspace - newspace);
		}
	}
done:
/*
 * BEGIN PARANOIA
 * After the freeing above, the working copy must have ended up with
 * exactly the block pointers that were left in oip.
 */
	for (level = SINGLE; level <= TRIPLE; level++)
		if (ip->i_ib[level] != oip->i_ib[level]) {
			err = ufs_fault(ITOV(ip), "ufs_itrunc: indirect block");
			return (err);
		}

	for (i = 0; i < NDADDR; i++)
		if (ip->i_db[i] != oip->i_db[i]) {
			err = ufs_fault(ITOV(ip), "ufs_itrunc: direct block");
			return (err);
		}
/* END PARANOIA */
	oip->i_blocks -= blocksreleased;

	if (oip->i_blocks < 0) {		/* sanity */
		cmn_err(CE_NOTE,
		    "ufs_itrunc: %s/%d new size = %lld, blocks = %d\n",
		    fs->fs_fsmnt, (int)oip->i_number, oip->i_size,
		    (int)oip->i_blocks);
		oip->i_blocks = 0;
	}
	oip->i_flag |= ICHG|IATTCHG;
	oip->i_seq++;
	/* blocksreleased is >= zero, so this can not fail */
	(void) chkdq(oip, -blocksreleased, 0, cr, (char **)NULL,
	    (size_t *)NULL);
	return (0);
}
1517 */ 1518 int 1519 ufs_iaccess(void *vip, int mode, struct cred *cr) 1520 { 1521 struct inode *ip = vip; 1522 int shift = 0; 1523 1524 if (mode & IWRITE) { 1525 /* 1526 * Disallow write attempts on read-only 1527 * file systems, unless the file is a block 1528 * or character device or a FIFO. 1529 */ 1530 if (ip->i_fs->fs_ronly != 0) { 1531 if ((ip->i_mode & IFMT) != IFCHR && 1532 (ip->i_mode & IFMT) != IFBLK && 1533 (ip->i_mode & IFMT) != IFIFO) { 1534 return (EROFS); 1535 } 1536 } 1537 } 1538 /* 1539 * If there is a shadow inode check for the presence of an acl, 1540 * if the acl is there use the ufs_acl_access routine to check 1541 * the acl 1542 */ 1543 if (ip->i_ufs_acl && ip->i_ufs_acl->aowner) 1544 return (ufs_acl_access(ip, mode, cr)); 1545 1546 /* 1547 * Access check is based on only 1548 * one of owner, group, public. 1549 * If not owner, then check group. 1550 * If not a member of the group, then 1551 * check public access. 1552 */ 1553 if (crgetuid(cr) != ip->i_uid) { 1554 shift += 3; 1555 if (!groupmember((uid_t)ip->i_gid, cr)) 1556 shift += 3; 1557 } 1558 1559 mode &= ~(ip->i_mode << shift); 1560 1561 if (mode == 0) 1562 return (0); 1563 1564 /* test missing privilege bits */ 1565 return (secpolicy_vnode_access(cr, ITOV(ip), ip->i_uid, mode)); 1566 } 1567 1568 /* 1569 * if necessary, remove an inode from the free list 1570 * i_contents is held except at unmount 1571 * 1572 * Return 1 if the inode is taken off of the ufs_idle_q, 1573 * and the caller is expected to call VN_RELE. 1574 * 1575 * Return 0 otherwise. 
1576 */ 1577 int 1578 ufs_rmidle(struct inode *ip) 1579 { 1580 int rval = 0; 1581 1582 mutex_enter(&ip->i_tlock); 1583 if ((ip->i_flag & IREF) == 0) { 1584 mutex_enter(&ufs_idle_q.uq_mutex); 1585 ip->i_freef->i_freeb = ip->i_freeb; 1586 ip->i_freeb->i_freef = ip->i_freef; 1587 ip->i_freef = ip; 1588 ip->i_freeb = ip; 1589 ip->i_flag |= IREF; 1590 ufs_idle_q.uq_ne--; 1591 if (ip->i_flag & IJUNKIQ) { 1592 ufs_njunk_iq--; 1593 ip->i_flag &= ~IJUNKIQ; 1594 } else { 1595 ufs_nuseful_iq--; 1596 } 1597 mutex_exit(&ufs_idle_q.uq_mutex); 1598 rval = 1; 1599 } 1600 mutex_exit(&ip->i_tlock); 1601 return (rval); 1602 } 1603 1604 /* 1605 * scan the hash of inodes and call func with the inode locked 1606 */ 1607 int 1608 ufs_scan_inodes(int rwtry, int (*func)(struct inode *, void *), void *arg, 1609 struct ufsvfs *ufsvfsp) 1610 { 1611 struct inode *ip; /* current inode */ 1612 struct inode *lip = NULL; /* last/previous inode */ 1613 union ihead *ih; /* current hash chain */ 1614 int error, i; 1615 int saverror = 0; 1616 int lip_held; /* lip needs a VN_RELE() */ 1617 1618 /* 1619 * If ufsvfsp is NULL, then our caller should be holding 1620 * ufs_scan_lock to avoid conflicts between ufs_unmount() and 1621 * ufs_update(). Otherwise, to avoid false-positives in 1622 * ufs_unmount()'s v_count-based EBUSY check, we only hold 1623 * those inodes that are in the file system our caller cares 1624 * about. 1625 * 1626 * We know that ip is a valid inode in the hash chain (and thus 1627 * we can trust i_ufsvfs) because the inode we chained from 1628 * (lip) is still in the hash chain. This is true because either: 1629 * 1630 * 1. We did not drop the hash chain lock since the last 1631 * iteration (because we were not interested in the last inode), 1632 * or 1633 * 2. We maintained a hold on the last inode while we 1634 * we were processing it, so it could not be removed 1635 * from the hash chain. 
1636 * 1637 * The whole reason we're dropping and re-grabbing the chain 1638 * lock on every inode is so that we don't present a major 1639 * choke point on throughput, particularly when we've been 1640 * called on behalf of fsflush. 1641 */ 1642 1643 for (i = 0, ih = ihead; i < inohsz; i++, ih++) { 1644 mutex_enter(&ih_lock[i]); 1645 for (ip = ih->ih_chain[0], lip_held = 0; 1646 ip != (struct inode *)ih; 1647 ip = lip->i_forw) { 1648 1649 ins.in_scan.value.ul++; 1650 1651 /* 1652 * Undo the previous iteration's VN_HOLD(), but 1653 * only if one was done. 1654 */ 1655 if (lip_held) 1656 VN_RELE(ITOV(lip)); 1657 1658 lip = ip; 1659 if (ufsvfsp != NULL && ip->i_ufsvfs != ufsvfsp) { 1660 /* 1661 * We're not processing all inodes, and 1662 * this inode is not in the filesystem of 1663 * interest, so skip it. No need to do a 1664 * VN_HOLD() since we're not dropping the 1665 * hash chain lock until after we've 1666 * done the i_forw traversal above. 1667 */ 1668 lip_held = 0; 1669 continue; 1670 } 1671 VN_HOLD(ITOV(ip)); 1672 lip_held = 1; 1673 mutex_exit(&ih_lock[i]); 1674 1675 /* 1676 * Acquire the contents lock as writer to make 1677 * sure that the inode has been initialized in 1678 * the cache or removed from the idle list by 1679 * ufs_iget(). This works because ufs_iget() 1680 * acquires the contents lock before putting 1681 * the inode into the cache. If we can lock 1682 * it, then he's done with it. 1683 */ 1684 1685 if (rwtry) { 1686 if (!rw_tryenter(&ip->i_contents, RW_WRITER)) { 1687 mutex_enter(&ih_lock[i]); 1688 continue; 1689 } 1690 } else { 1691 rw_enter(&ip->i_contents, RW_WRITER); 1692 } 1693 1694 rw_exit(&ip->i_contents); 1695 1696 /* 1697 * ISTALE means the inode couldn't be read 1698 * 1699 * We don't have to hold the i_contents lock 1700 * for this check for a couple of 1701 * reasons. 
First, if ISTALE is set then the 1702 * flag cannot be cleared until the inode is 1703 * removed from the cache and that cannot 1704 * happen until after we VN_RELE() it. 1705 * Second, if ISTALE is not set, then the 1706 * inode is in the cache and does not need to 1707 * be read from disk so ISTALE cannot be set 1708 * while we are not looking. 1709 */ 1710 if ((ip->i_flag & ISTALE) == 0) { 1711 if ((error = (*func)(ip, arg)) != 0) 1712 saverror = error; 1713 } 1714 1715 mutex_enter(&ih_lock[i]); 1716 } 1717 if (lip_held) 1718 VN_RELE(ITOV(lip)); 1719 mutex_exit(&ih_lock[i]); 1720 } 1721 return (saverror); 1722 } 1723 1724 /* 1725 * Mark inode with the current time, plus a unique increment. 1726 * 1727 * Since we only keep 32-bit time on disk, if UFS is still alive 1728 * beyond 2038, filesystem times will simply stick at the last 1729 * possible second of 32-bit time. Not ideal, but probably better 1730 * than going into the remote past, or confusing applications with 1731 * negative time. 1732 */ 1733 void 1734 ufs_imark(struct inode *ip) 1735 { 1736 timestruc_t now; 1737 int32_t usec, nsec; 1738 1739 /* 1740 * The update of i_seq may have been deferred, increase i_seq here 1741 * to make sure it is in sync with the timestamps. 1742 */ 1743 if (ip->i_flag & ISEQ) { 1744 ASSERT(ip->i_flag & (IUPD|ICHG)); 1745 ip->i_seq++; 1746 ip->i_flag &= ~ISEQ; 1747 } 1748 1749 gethrestime(&now); 1750 1751 /* 1752 * Fast algorithm to convert nsec to usec -- see hrt2ts() 1753 * in common/os/timers.c for a full description. 
1754 */ 1755 nsec = now.tv_nsec; 1756 usec = nsec + (nsec >> 2); 1757 usec = nsec + (usec >> 1); 1758 usec = nsec + (usec >> 2); 1759 usec = nsec + (usec >> 4); 1760 usec = nsec - (usec >> 3); 1761 usec = nsec + (usec >> 2); 1762 usec = nsec + (usec >> 3); 1763 usec = nsec + (usec >> 4); 1764 usec = nsec + (usec >> 1); 1765 usec = nsec + (usec >> 6); 1766 usec = usec >> 10; 1767 1768 mutex_enter(&ufs_iuniqtime_lock); 1769 if (now.tv_sec > (time_t)iuniqtime.tv_sec || 1770 usec > iuniqtime.tv_usec) { 1771 if (now.tv_sec < TIME32_MAX) { 1772 iuniqtime.tv_sec = (time32_t)now.tv_sec; 1773 iuniqtime.tv_usec = usec; 1774 } 1775 } else { 1776 if (iuniqtime.tv_sec < TIME32_MAX) { 1777 iuniqtime.tv_usec++; 1778 /* Check for usec overflow */ 1779 if (iuniqtime.tv_usec >= MICROSEC) { 1780 iuniqtime.tv_sec++; 1781 iuniqtime.tv_usec = 0; 1782 } 1783 } 1784 } 1785 1786 if ((ip->i_flag & IACC) && !(ip->i_ufsvfs->vfs_noatime)) { 1787 ip->i_atime = iuniqtime; 1788 } 1789 if (ip->i_flag & IUPD) { 1790 ip->i_mtime = iuniqtime; 1791 ip->i_flag |= IMODTIME; 1792 } 1793 if (ip->i_flag & ICHG) { 1794 ip->i_diroff = 0; 1795 ip->i_ctime = iuniqtime; 1796 } 1797 mutex_exit(&ufs_iuniqtime_lock); 1798 } 1799 1800 /* 1801 * Update timestamps in inode. 1802 */ 1803 void 1804 ufs_itimes_nolock(struct inode *ip) 1805 { 1806 1807 /* 1808 * if noatime is set and the inode access time is the only field that 1809 * must be changed, exit immediately. 1810 */ 1811 if (((ip->i_flag & (IUPD|IACC|ICHG)) == IACC) && 1812 (ip->i_ufsvfs->vfs_noatime)) { 1813 return; 1814 } 1815 1816 if (ip->i_flag & (IUPD|IACC|ICHG)) { 1817 if (ip->i_flag & ICHG) 1818 ip->i_flag |= IMOD; 1819 else 1820 ip->i_flag |= IMODACC; 1821 ufs_imark(ip); 1822 ip->i_flag &= ~(IACC|IUPD|ICHG); 1823 } 1824 } 1825