/* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ /* All Rights Reserved */ /* * University Copyright- Copyright (c) 1982, 1986, 1988 * The Regents of the University of California * All Rights Reserved * * University Acknowledgment- Portions of this document are derived from * software developed by the University of California, Berkeley, and its * contributors. 
*/ #pragma ident "%Z%%M% %I% %E% SMI" #include <sys/types.h> #include <sys/t_lock.h> #include <sys/param.h> #include <sys/systm.h> #include <sys/uio.h> #include <sys/bitmap.h> #include <sys/signal.h> #include <sys/cred.h> #include <sys/user.h> #include <sys/vfs.h> #include <sys/stat.h> #include <sys/vnode.h> #include <sys/buf.h> #include <sys/proc.h> #include <sys/disp.h> #include <sys/dnlc.h> #include <sys/mode.h> #include <sys/cmn_err.h> #include <sys/kstat.h> #include <sys/acl.h> #include <sys/var.h> #include <sys/fs/ufs_inode.h> #include <sys/fs/ufs_fs.h> #include <sys/fs/ufs_trans.h> #include <sys/fs/ufs_acl.h> #include <sys/fs/ufs_bio.h> #include <sys/fs/ufs_quota.h> #include <sys/fs/ufs_log.h> #include <vm/hat.h> #include <vm/as.h> #include <vm/pvn.h> #include <vm/seg.h> #include <sys/swap.h> #include <sys/cpuvar.h> #include <sys/sysmacros.h> #include <sys/errno.h> #include <sys/kmem.h> #include <sys/debug.h> #include <fs/fs_subr.h> #include <sys/policy.h> struct kmem_cache *inode_cache; /* cache of free inodes */ /* UFS Inode Cache Stats -- Not protected */ struct instats ins = { { "size", KSTAT_DATA_ULONG }, { "maxsize", KSTAT_DATA_ULONG }, { "hits", KSTAT_DATA_ULONG }, { "misses", KSTAT_DATA_ULONG }, { "kmem allocs", KSTAT_DATA_ULONG }, { "kmem frees", KSTAT_DATA_ULONG }, { "maxsize reached", KSTAT_DATA_ULONG }, { "puts at frontlist", KSTAT_DATA_ULONG }, { "puts at backlist", KSTAT_DATA_ULONG }, { "queues to free", KSTAT_DATA_ULONG }, { "scans", KSTAT_DATA_ULONG }, { "thread idles", KSTAT_DATA_ULONG }, { "lookup idles", KSTAT_DATA_ULONG }, { "vget idles", KSTAT_DATA_ULONG }, { "cache allocs", KSTAT_DATA_ULONG }, { "cache frees", KSTAT_DATA_ULONG }, { "pushes at close", KSTAT_DATA_ULONG } }; /* kstat data */ static kstat_t *ufs_inode_kstat = NULL; union ihead *ihead; /* inode LRU cache, Chris Maltby */ kmutex_t *ih_lock; /* protect inode cache hash table */ static int ino_hashlen = 4; /* desired average hash chain length */ int inohsz; /* number of buckets 
in the hash table */ kmutex_t ufs_scan_lock; /* stop racing multiple ufs_scan_inodes() */ kmutex_t ufs_iuniqtime_lock; /* protect iuniqtime */ kmutex_t ufsvfs_mutex; struct ufsvfs *oldufsvfslist, *ufsvfslist; /* * time to wait after ufsvfsp->vfs_iotstamp before declaring that no * I/Os are going on. */ clock_t ufs_iowait; /* * the threads that process idle inodes and free (deleted) inodes * have high water marks that are set in ufsinit(). * These values but can be no less then the minimum shown below */ int ufs_idle_max; /* # of allowable idle inodes */ ulong_t ufs_inode_max; /* hard limit of allowable idle inodes */ #define UFS_IDLE_MAX (16) /* min # of allowable idle inodes */ /* * Tunables for ufs write throttling. * These are validated in ufs_iinit() since improper settings * can lead to filesystem hangs. */ #define UFS_HW_DEFAULT (16 * 1024 * 1024) #define UFS_LW_DEFAULT (8 * 1024 * 1024) int ufs_HW = UFS_HW_DEFAULT; int ufs_LW = UFS_LW_DEFAULT; static void ihinit(void); extern int hash2ints(int, int); static int ufs_iget_internal(struct vfs *, ino_t, struct inode **, struct cred *, int); /* ARGSUSED */ static int ufs_inode_kstat_update(kstat_t *ksp, int rw) { if (rw == KSTAT_WRITE) return (EACCES); ins.in_malloc.value.ul = (ulong_t)kmem_cache_stat(inode_cache, "slab_alloc"); ins.in_mfree.value.ul = (ulong_t)kmem_cache_stat(inode_cache, "slab_free"); ins.in_kcalloc.value.ul = (ulong_t)kmem_cache_stat(inode_cache, "alloc"); ins.in_kcfree.value.ul = (ulong_t)kmem_cache_stat(inode_cache, "free"); ins.in_size.value.ul = (ulong_t)kmem_cache_stat(inode_cache, "buf_inuse"); ins.in_maxreached.value.ul = (ulong_t)kmem_cache_stat(inode_cache, "buf_max"); ins.in_misses.value.ul = ins.in_kcalloc.value.ul; return (0); } void ufs_iinit(void) { /* * Validate that ufs_HW > ufs_LW. * The default values for these two tunables have been increased. * There is now a range of values for ufs_HW that used to be * legal on previous Solaris versions but no longer is now. 
* Upgrading a machine which has an /etc/system setting for ufs_HW * from that range can lead to filesystem hangs unless the values * are checked here. */ if (ufs_HW <= ufs_LW) { cmn_err(CE_WARN, "ufs_HW (%d) <= ufs_LW (%d). Check /etc/system.", ufs_HW, ufs_LW); ufs_LW = UFS_LW_DEFAULT; ufs_HW = UFS_HW_DEFAULT; cmn_err(CE_CONT, "using defaults, ufs_HW = %d, ufs_LW = %d\n", ufs_HW, ufs_LW); } /* * Adjust the tunable `ufs_ninode' to a reasonable value */ if (ufs_ninode <= 0) ufs_ninode = ncsize; if (ufs_inode_max == 0) ufs_inode_max = (ulong_t)((kmem_maxavail() >> 2) / sizeof (struct inode)); if (ufs_ninode > ufs_inode_max || (ufs_ninode == 0 && ncsize == 0)) { cmn_err(CE_NOTE, "setting ufs_ninode to max value of %ld", ufs_inode_max); ufs_ninode = ufs_inode_max; } /* * Wait till third call of ufs_update to declare that no I/Os are * going on. This allows deferred access times to be flushed to disk. */ ufs_iowait = v.v_autoup * hz * 2; /* * idle thread runs when 25% of ufs_ninode entries are on the queue */ if (ufs_idle_max == 0) ufs_idle_max = ufs_ninode >> 2; if (ufs_idle_max < UFS_IDLE_MAX) ufs_idle_max = UFS_IDLE_MAX; if (ufs_idle_max > ufs_ninode) ufs_idle_max = ufs_ninode; /* * This is really a misnomer, it is ufs_queue_init */ ufs_thread_init(&ufs_idle_q, ufs_idle_max); ufs_thread_start(&ufs_idle_q, ufs_thread_idle, NULL); /* * global hlock thread */ ufs_thread_init(&ufs_hlock, 1); ufs_thread_start(&ufs_hlock, ufs_thread_hlock, NULL); ihinit(); qtinit(); ins.in_maxsize.value.ul = ufs_ninode; if ((ufs_inode_kstat = kstat_create("ufs", 0, "inode_cache", "ufs", KSTAT_TYPE_NAMED, sizeof (ins) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL)) != NULL) { ufs_inode_kstat->ks_data = (void *)&ins; ufs_inode_kstat->ks_update = ufs_inode_kstat_update; kstat_install(ufs_inode_kstat); } ufsfx_init(); /* fix-on-panic initialization */ si_cache_init(); ufs_directio_init(); lufs_init(); mutex_init(&ufs_iuniqtime_lock, NULL, MUTEX_DEFAULT, NULL); } /* ARGSUSED */ static int 
ufs_inode_cache_constructor(void *buf, void *cdrarg, int kmflags) { struct inode *ip = buf; struct vnode *vp; rw_init(&ip->i_rwlock, NULL, RW_DEFAULT, NULL); rw_init(&ip->i_contents, NULL, RW_DEFAULT, NULL); mutex_init(&ip->i_tlock, NULL, MUTEX_DEFAULT, NULL); dnlc_dir_init(&ip->i_danchor); cv_init(&ip->i_wrcv, NULL, CV_DRIVER, NULL); vp = vn_alloc(KM_SLEEP); ip->i_vnode = vp; vn_setops(vp, ufs_vnodeops); vp->v_data = (caddr_t)ip; return (0); } /* ARGSUSED */ static void ufs_inode_cache_destructor(void *buf, void *cdrarg) { struct inode *ip = buf; struct vnode *vp; vp = ITOV(ip); rw_destroy(&ip->i_rwlock); rw_destroy(&ip->i_contents); mutex_destroy(&ip->i_tlock); if (vp->v_type == VDIR) { dnlc_dir_fini(&ip->i_danchor); } cv_destroy(&ip->i_wrcv); vn_free(vp); } /* * Initialize hash links for inodes * and build inode free list. */ void ihinit(void) { int i; union ihead *ih = ihead; mutex_init(&ufs_scan_lock, NULL, MUTEX_DEFAULT, NULL); inohsz = 1 << highbit(ufs_ninode / ino_hashlen); ihead = kmem_zalloc(inohsz * sizeof (union ihead), KM_SLEEP); ih_lock = kmem_zalloc(inohsz * sizeof (kmutex_t), KM_SLEEP); for (i = 0, ih = ihead; i < inohsz; i++, ih++) { ih->ih_head[0] = ih; ih->ih_head[1] = ih; mutex_init(&ih_lock[i], NULL, MUTEX_DEFAULT, NULL); } inode_cache = kmem_cache_create("ufs_inode_cache", sizeof (struct inode), 0, ufs_inode_cache_constructor, ufs_inode_cache_destructor, ufs_inode_cache_reclaim, NULL, NULL, 0); } /* * Free an inode structure */ void ufs_free_inode(struct inode *ip) { vn_invalid(ITOV(ip)); kmem_cache_free(inode_cache, ip); } /* * Allocate an inode structure */ struct inode * ufs_alloc_inode(ufsvfs_t *ufsvfsp, ino_t ino) { struct inode *ip; vnode_t *vp; ip = kmem_cache_alloc(inode_cache, KM_SLEEP); /* * at this point we have a newly allocated inode */ ip->i_freef = ip; ip->i_freeb = ip; ip->i_flag = IREF; ip->i_seq = 0xFF; /* Unique initial value */ ip->i_dev = ufsvfsp->vfs_dev; ip->i_ufsvfs = ufsvfsp; ip->i_devvp = ufsvfsp->vfs_devvp; 
ip->i_number = ino; ip->i_diroff = 0; ip->i_nextr = 0; ip->i_map = NULL; ip->i_rdev = 0; ip->i_writes = 0; ip->i_mode = 0; ip->i_delaylen = 0; ip->i_delayoff = 0; ip->i_nextrio = 0; ip->i_ufs_acl = NULL; ip->i_cflags = 0; ip->i_mapcnt = 0; ip->i_dquot = NULL; ip->i_cachedir = CD_ENABLED; ip->i_writer = NULL; /* * the vnode for this inode was allocated by the constructor */ vp = ITOV(ip); vn_reinit(vp); if (ino == (ino_t)UFSROOTINO) vp->v_flag = VROOT; vp->v_vfsp = ufsvfsp->vfs_vfs; vn_exists(vp); return (ip); } /* * Look up an inode by device, inumber. If it is in core (in the * inode structure), honor the locking protocol. If it is not in * core, read it in from the specified device after freeing any pages. * In all cases, a pointer to a VN_HELD inode structure is returned. */ int ufs_iget(struct vfs *vfsp, ino_t ino, struct inode **ipp, struct cred *cr) { return (ufs_iget_internal(vfsp, ino, ipp, cr, 0)); } /* * A version of ufs_iget which returns only allocated, linked inodes. * This is appropriate for any callers who do not expect a free inode. */ int ufs_iget_alloced(struct vfs *vfsp, ino_t ino, struct inode **ipp, struct cred *cr) { return (ufs_iget_internal(vfsp, ino, ipp, cr, 1)); } /* * Set vnode attributes based on v_type, this should be called whenever * an inode's i_mode is changed. */ void ufs_reset_vnode(vnode_t *vp) { /* * an old DBE hack */ if ((VTOI(vp)->i_mode & (ISVTX | IEXEC | IFDIR)) == ISVTX) vp->v_flag |= VSWAPLIKE; else vp->v_flag &= ~VSWAPLIKE; /* * if not swap like and it's just a regular file, we want * to maintain the vnode's pages sorted by clean/modified * for faster sync'ing to disk */ if (vp->v_type == VREG) vp->v_flag |= VMODSORT; else vp->v_flag &= ~VMODSORT; /* * Is this an attribute hidden dir? */ if ((VTOI(vp)->i_mode & IFMT) == IFATTRDIR) vp->v_flag |= V_XATTRDIR; else vp->v_flag &= ~V_XATTRDIR; } /* * Shared implementation of ufs_iget and ufs_iget_alloced. 
The 'validate'
 * flag is used to distinguish the two; when true, we validate that the inode
 * being retrieved looks like a linked and allocated inode.
 */
/*
 * Parameters:
 *   vfsp     - file system to look in; its vfs_dev keys the cache lookup.
 *   ino      - inode number.
 *   ipp      - out: on success, set to the held inode.
 *   cr       - credentials handed to ufs_si_load() when a shadow inode
 *              has to be loaded.
 *   validate - nonzero to reject inodes with no type or no links (EIO).
 *
 * Returns 0 on success.  On failure returns the error from reading the
 * dinode, EIO for a failed validation, or the error from ufs_si_load();
 * in all failure cases the new inode is marked ISTALE, its vnode is
 * pointed at EIO_vfs and the hold is released.
 */
/* ARGSUSED */
static int
ufs_iget_internal(struct vfs *vfsp, ino_t ino, struct inode **ipp,
    struct cred *cr, int validate)
{
	struct inode *ip, *sp;
	union ihead *ih;
	kmutex_t *ihm;
	struct buf *bp;
	struct dinode *dp;
	struct vnode *vp;
	extern vfs_t EIO_vfs;
	int error;
	int ftype;	/* XXX - Remove later on */
	dev_t vfs_dev;
	struct ufsvfs *ufsvfsp;
	struct fs *fs;
	int hno;
	daddr_t bno;
	ulong_t ioff;

	CPU_STATS_ADD_K(sys, ufsiget, 1);

	/*
	 * Lookup inode in cache.
	 */
	vfs_dev = vfsp->vfs_dev;
	hno = INOHASH(ino);
	ih = &ihead[hno];
	ihm = &ih_lock[hno];

again:
	mutex_enter(ihm);
	for (ip = ih->ih_chain[0]; ip != (struct inode *)ih; ip = ip->i_forw) {
		/* skip other inodes and entries already marked stale */
		if (ino != ip->i_number || vfs_dev != ip->i_dev ||
		    (ip->i_flag & ISTALE))
			continue;

		/*
		 * Found the interesting inode; hold it and drop the cache lock
		 */
		vp = ITOV(ip);	/* for locknest */
		VN_HOLD(vp);
		mutex_exit(ihm);
		rw_enter(&ip->i_contents, RW_READER);

		/*
		 * if necessary, remove from idle list
		 * NOTE(review): presumably ufs_rmidle() returns nonzero when
		 * it actually dequeued the inode, in which case one hold is
		 * dropped here - confirm against ufs_rmidle().
		 */
		if ((ip->i_flag & IREF) == 0) {
			if (ufs_rmidle(ip))
				VN_RELE(vp);
		}

		/*
		 * Could the inode be read from disk?
		 * (ISTALE may have been set while we waited for i_contents;
		 * if so, give up this entry and redo the lookup.)
		 */
		if (ip->i_flag & ISTALE) {
			rw_exit(&ip->i_contents);
			VN_RELE(vp);
			goto again;
		}

		ins.in_hits.value.ul++;
		*ipp = ip;

		/*
		 * Reset the vnode's attribute flags
		 */
		mutex_enter(&vp->v_lock);
		ufs_reset_vnode(vp);
		mutex_exit(&vp->v_lock);

		rw_exit(&ip->i_contents);

		return (0);
	}
	mutex_exit(ihm);

	/*
	 * Inode was not in cache.
	 *
	 * Allocate a new entry
	 */
	ufsvfsp = (struct ufsvfs *)vfsp->vfs_data;
	fs = ufsvfsp->vfs_fs;

	ip = ufs_alloc_inode(ufsvfsp, ino);
	vp = ITOV(ip);

	/*
	 * Locate the on-disk inode: bno is the disk block holding it,
	 * ioff its byte offset within that block, and i_doff the sum of
	 * the two as a byte offset.
	 */
	bno = fsbtodb(fs, itod(fs, ino));
	ioff = (sizeof (struct dinode)) * (itoo(fs, ino));
	ip->i_doff = (offset_t)ioff + ldbtob(bno);

	/*
	 * put a place holder in the cache (if not already there)
	 *
	 * The hash lock was dropped while allocating, so the chain is
	 * searched again; if a usable entry for this inode has appeared
	 * in the meantime, ours is freed and the lookup restarts.
	 */
	mutex_enter(ihm);
	for (sp = ih->ih_chain[0]; sp != (struct inode *)ih; sp = sp->i_forw)
		if (ino == sp->i_number && vfs_dev == sp->i_dev &&
		    ((sp->i_flag & ISTALE) == 0)) {
			mutex_exit(ihm);
			ufs_free_inode(ip);
			goto again;
		}
	/*
	 * It would be nice to ASSERT(RW_READ_HELD(&ufsvfsp->vfs_dqrwlock))
	 * here, but if we do, then shadow inode allocations panic the
	 * system.  We don't have to hold vfs_dqrwlock for shadow inodes
	 * and the ufs_iget() parameters don't tell us what we are getting
	 * so we have no way of knowing this is a ufs_iget() call from
	 * a ufs_ialloc() call for a shadow inode.
	 */
	/*
	 * i_contents is taken as writer before the inode becomes visible
	 * on the hash chain; the cache-hit path above takes it as reader
	 * after finding an entry, so it waits until we are done here.
	 */
	rw_enter(&ip->i_contents, RW_WRITER);
	insque(ip, ih);
	mutex_exit(ihm);
	/*
	 * read the dinode
	 */
	bp = UFS_BREAD(ufsvfsp, ip->i_dev, bno, (int)fs->fs_bsize);

	/*
	 * Check I/O errors
	 */
	error = ((bp->b_flags & B_ERROR) ? geterror(bp) : 0);
	if (error) {
		brelse(bp);
		ip->i_flag |= ISTALE;	/* in case someone is looking it up */
		rw_exit(&ip->i_contents);
		vp->v_vfsp = &EIO_vfs;
		VN_RELE(vp);
		return (error);
	}
	/*
	 * initialize the inode's dinode
	 */
	dp = (struct dinode *)(ioff + bp->b_un.b_addr);
	ip->i_ic = dp->di_ic;			/* structure assignment */
	brelse(bp);

	/*
	 * Maintain compatibility with Solaris 1.x UFS
	 * (the short ids are authoritative unless they hold the *_LONG
	 * marker value)
	 */
	if (ip->i_suid != UID_LONG)
		ip->i_uid = ip->i_suid;
	if (ip->i_sgid != GID_LONG)
		ip->i_gid = ip->i_sgid;

	ftype = ip->i_mode & IFMT;
	if (ftype == IFBLK || ftype == IFCHR) {
		dev_t dv;
		uint_t top16 = ip->i_ordev & 0xffff0000u;

		/*
		 * The upper 16 bits of the stored device number select
		 * which expansion routine is used: all zeroes or all ones
		 * go through expdev(), anything else through expldev().
		 */
		if (top16 == 0 || top16 == 0xffff0000u)
			dv = expdev(ip->i_ordev);
		else
			dv = expldev(ip->i_ordev);
		vp->v_rdev = ip->i_rdev = dv;
	}

	/*
	 * if our caller only expects allocated inodes, verify that
	 * this inode looks good; throw it out if it's bad.
	 */
	if (validate) {
		if ((ftype == 0) || (ip->i_nlink <= 0)) {
			ip->i_flag |= ISTALE;
			rw_exit(&ip->i_contents);
			vp->v_vfsp = &EIO_vfs;
			VN_RELE(vp);
			cmn_err(CE_NOTE,
			    "%s: unexpected free inode %d, run fsck(1M)%s",
			    fs->fs_fsmnt, (int)ino,
			    (TRANS_ISTRANS(ufsvfsp) ? " -o f" : ""));
			return (EIO);
		}
	}

	/*
	 * Finish initializing the vnode, special handling for shadow inodes
	 * because IFTOVT() will produce a v_type of VNON which is not what we
	 * want, set v_type to VREG explicitly in that case.
	 */
	if (ftype == IFSHAD) {
		vp->v_type = VREG;
	} else {
		vp->v_type = IFTOVT((mode_t)ip->i_mode);
	}

	ufs_reset_vnode(vp);

	/*
	 * read the shadow
	 */
	if (ftype != 0 && ip->i_shadow != 0) {
		if ((error = ufs_si_load(ip, cr)) != 0) {
			ip->i_flag |= ISTALE;
			ip->i_ufs_acl = NULL;
			rw_exit(&ip->i_contents);
			vp->v_vfsp = &EIO_vfs;
			VN_RELE(vp);
			return (error);
		}
	}

	/*
	 * Only attach quota information if the inode has a type and if
	 * that type is not a shadow inode.
	 * (Attribute directories, IFATTRDIR, are excluded as well.)
	 */
	if (ip->i_mode && ((ip->i_mode & IFMT) != IFSHAD) &&
	    ((ip->i_mode & IFMT) != IFATTRDIR)) {
		ip->i_dquot = getinoquota(ip);
	}
	TRANS_MATA_IGET(ufsvfsp, ip);
	*ipp = ip;
	rw_exit(&ip->i_contents);

	return (0);
}

/*
 * Vnode is no longer referenced, write the inode out
 * and if necessary, truncate and deallocate the file.
 *
 * Depending on the inode's state this function either just drops the
 * caller's hold (someone else still references the vnode, or deletes are
 * currently disallowed), frees the inode outright (file system already
 * gone), deletes it directly via ufs_delete() (no transactions), or
 * queues it on the per-file-system delete queue or the global idle queue.
 */
void
ufs_iinactive(struct inode *ip)
{
	int front;
	struct inode *iq;
	struct inode *hip;
	struct ufs_q *uq;
	struct vnode *vp = ITOV(ip);
	struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
	/*
	 * NOTE(review): this address is formed before the i_ufsvfs == NULL
	 * check below; it is only dereferenced on the delete-queue path.
	 */
	struct ufs_delq_info *delq_info = &ufsvfsp->vfs_delete_info;

	/*
	 * Because the vnode type might have been changed,
	 * the dnlc_dir_purge must be called unconditionally.
	 */
	dnlc_dir_purge(&ip->i_danchor);

	/*
	 * Get exclusive access to inode data.
	 */
	rw_enter(&ip->i_contents, RW_WRITER);
	ASSERT(ip->i_flag & IREF);

	/*
	 * Make sure no one reclaimed the inode before we put it on
	 * the freelist or destroy it. We keep our 'hold' on the vnode
	 * from vn_rele until we are ready to do something with the inode.
	 *
	 * Pageout may put a VN_HOLD/VN_RELE at anytime during this
	 * operation via an async putpage, so we must make sure
	 * we don't free/destroy the inode more than once. ufs_iget
	 * may also put a VN_HOLD on the inode before it grabs
	 * the i_contents lock. This is done so we don't free
	 * an inode that a thread is waiting on.
	 */
	mutex_enter(&vp->v_lock);

	if (vp->v_count > 1) {
		vp->v_count--;  /* release our hold from vn_rele */
		mutex_exit(&vp->v_lock);
		rw_exit(&ip->i_contents);
		return;
	}
	mutex_exit(&vp->v_lock);

	/*
	 * For umount case: if ufsvfs ptr is NULL, the inode is unhashed
	 * and clean.  It can be safely destroyed (cyf).
	 */
	if (ip->i_ufsvfs == NULL) {
		rw_exit(&ip->i_contents);
		ufs_si_del(ip);
		ASSERT((vp->v_type == VCHR) || !vn_has_cached_data(vp));
		ufs_free_inode(ip);
		return;
	}

	/*
	 * queue idle inode to appropriate thread. Will check v_count == 1
	 * prior to putting this on the appropriate queue.
	 * Stale inodes will be unhashed and freed by the ufs idle thread
	 * in ufs_idle_free()
	 */
	front = 1;
	/*
	 * Delete path: a live (not stale) inode on a writable file system
	 * that has a mode but no remaining links.
	 */
	if ((ip->i_flag & ISTALE) == 0 && ip->i_fs->fs_ronly == 0 &&
	    ip->i_mode && ip->i_nlink <= 0) {
		/*
		 * Mark the i_flag to indicate that inode is being deleted.
		 * This flag will be cleared when the deletion is complete.
		 * This prevents nfs from sneaking in via ufs_vget() while
		 * the delete is in progress (bugid 1242481).
		 */
		ip->i_flag |= IDEL;

		/*
		 * NOIDEL means that deletes are not allowed at this time;
		 * whoever resets NOIDEL will also send this inode back
		 * through ufs_iinactive.  IREF remains set.
		 */
		if (ULOCKFS_IS_NOIDEL(ITOUL(ip))) {
			mutex_enter(&vp->v_lock);
			vp->v_count--;
			mutex_exit(&vp->v_lock);
			rw_exit(&ip->i_contents);
			return;
		}
		/* without transactions the delete is done right here */
		if (!TRANS_ISTRANS(ip->i_ufsvfs)) {
			rw_exit(&ip->i_contents);
			ufs_delete(ip->i_ufsvfs, ip, 0);
			return;
		}

		/* queue to delete thread; IREF remains set */
		ins.in_qfree.value.ul++;
		uq = &ip->i_ufsvfs->vfs_delete;

		mutex_enter(&uq->uq_mutex);

		/*
		 * add to q: link in just before the current head of the
		 * circular list; since `front' is always 1 here the new
		 * inode then becomes the head.
		 */
		if ((iq = uq->uq_ihead) != 0) {
			ip->i_freef = iq;
			ip->i_freeb = iq->i_freeb;
			iq->i_freeb->i_freef = ip;
			iq->i_freeb = ip;
			if (front)
				uq->uq_ihead = ip;
		} else {
			/* empty queue: the inode is linked to itself */
			uq->uq_ihead = ip;
			ip->i_freef = ip;
			ip->i_freeb = ip;
		}

		delq_info->delq_unreclaimed_files += 1;
		delq_info->delq_unreclaimed_blocks += ip->i_blocks;
	} else {
		/*
		 * queue to idle thread
		 *  Check the v_count == 1 again.
		 *
		 */
		mutex_enter(&vp->v_lock);
		if (vp->v_count > 1) {
			vp->v_count--;  /* release our hold from vn_rele */
			mutex_exit(&vp->v_lock);
			rw_exit(&ip->i_contents);
			return;
		}
		mutex_exit(&vp->v_lock);
		uq = &ufs_idle_q;

		/*
		 * useful iff it has pages or is a fastsymlink; otherwise junk
		 */
		mutex_enter(&uq->uq_mutex);

		/* clear IREF means `on idle list' */
		ip->i_flag &= ~(IREF | IDIRECTIO);

		if (vn_has_cached_data(vp) || ip->i_flag & IFASTSYMLNK) {
			ins.in_frback.value.ul++;
			hip = (inode_t *)&ufs_useful_iq[IQHASH(ip)];
			ufs_nuseful_iq++;
		} else {
			ins.in_frfront.value.ul++;
			hip = (inode_t *)&ufs_junk_iq[IQHASH(ip)];
			ip->i_flag |= IJUNKIQ;
			ufs_njunk_iq++;
		}
		/* link in just before the chosen queue head */
		ip->i_freef = hip;
		ip->i_freeb = hip->i_freeb;
		hip->i_freeb->i_freef = ip;
		hip->i_freeb = ip;
	}

	/*
	 * wakeup thread(s) if q is overfull
	 * (the broadcast fires only when the count reaches uq_lowat exactly)
	 */
	if (++uq->uq_ne == uq->uq_lowat)
		cv_broadcast(&uq->uq_cv);

	/* all done, release the q and inode */
	mutex_exit(&uq->uq_mutex);
	rw_exit(&ip->i_contents);
}

/*
 * Check accessed and update flags on an inode structure.
 * If any are on, update the inode with the (unique) current time.
 * If waitfor is given, insure I/O order so wait for write to complete.
*/
/*
 * Further notes on ufs_iupdat():
 *
 *   ip      - inode to push to its on-disk dinode; the caller must hold
 *             ip->i_contents as reader or writer (asserted below).
 *   waitfor - nonzero requests a synchronous update.
 *
 * Nothing is done when the file system has been forcibly unmounted
 * (i_ufsvfs == NULL) or the inode is ISTALE.  On a read-only file system,
 * or when the dinode block cannot be read, the pending update flags are
 * cleared and no write is issued.  The function returns no status.
 */
void
ufs_iupdat(struct inode *ip, int waitfor)
{
	struct buf *bp;
	struct fs *fp;
	struct dinode *dp;
	struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
	int i;
	int do_trans_times;
	ushort_t flag;
	o_uid_t suid;
	o_gid_t sgid;

	/*
	 * This function is now safe to be called with either the reader
	 * or writer i_contents lock.
	 */
	ASSERT(RW_LOCK_HELD(&ip->i_contents));

	/*
	 * Return if file system has been forcibly umounted.
	 */
	if (ufsvfsp == NULL)
		return;

	/*
	 * Snapshot of i_flag; the ISTALE, IFASTSYMLNK and IBDWRITE tests
	 * below use this snapshot rather than re-reading ip->i_flag.
	 */
	flag = ip->i_flag;	/* Atomic read */
	/*
	 * We better not update the disk inode from a stale inode.
	 */
	if (flag & ISTALE)
		return;

	fp = ip->i_fs;

	if ((flag & (IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG)) != 0) {
		if (fp->fs_ronly) {
			/* nothing can be written; just drop the flags */
			mutex_enter(&ip->i_tlock);
			ip->i_flag &= ~(IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG);
			mutex_exit(&ip->i_tlock);
			return;
		}
		/*
		 * fs is active while metadata is being written
		 *
		 * NOTE(review): vfs_lock is entered here and never exited
		 * in this function; presumably ufs_notclean() releases it -
		 * confirm against ufs_notclean().
		 */
		mutex_enter(&ufsvfsp->vfs_lock);
		ufs_notclean(ufsvfsp);
		/*
		 * get the dinode
		 */
		bp = UFS_BREAD(ufsvfsp, ip->i_dev,
		    (daddr_t)fsbtodb(fp, itod(fp, ip->i_number)),
		    (int)fp->fs_bsize);
		if (bp->b_flags & B_ERROR) {
			mutex_enter(&ip->i_tlock);
			ip->i_flag &=
			    ~(IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG);
			mutex_exit(&ip->i_tlock);
			brelse(bp);
			return;
		}
		/*
		 * munge inode fields
		 */
		mutex_enter(&ip->i_tlock);
		ITIMES_NOLOCK(ip);
		/* true only when IMODACC is set and IMOD is not */
		do_trans_times = ((ip->i_flag & (IMOD|IMODACC)) == IMODACC);
		ip->i_flag &= ~(IUPD|IACC|ICHG|IMOD|IMODACC|IATTCHG);
		mutex_exit(&ip->i_tlock);

		/*
		 * For reads and concurrent re-writes, no deltas were
		 * entered for the access time changes - do it now.
		 */
		if (do_trans_times) {
			TRANS_INODE_TIMES(ufsvfsp, ip);
		}

		/*
		 * For SunOS 5.0->5.4, these lines below read:
		 *
		 *	suid = (ip->i_uid > MAXUID) ? UID_LONG : ip->i_uid;
		 *	sgid = (ip->i_gid > MAXUID) ? GID_LONG : ip->i_gid;
		 *
		 * where MAXUID was set to 60002.  This was incorrect -
		 * the uids should have been constrained to what fitted into
		 * a 16-bit word.
		 *
		 * This means that files from 4.x filesystems that have an
		 * i_suid field larger than 60002 will have that field
		 * changed to 65535.
		 *
		 * Security note: 4.x UFS could never create a i_suid of
		 * UID_LONG since that would've corresponded to -1.
		 */
		suid = (ulong_t)ip->i_uid > (ulong_t)USHRT_MAX ?
		    UID_LONG : ip->i_uid;
		sgid = (ulong_t)ip->i_gid > (ulong_t)USHRT_MAX ?
		    GID_LONG : ip->i_gid;

		/* only touch (and log) the short ids when they changed */
		if ((ip->i_suid != suid) || (ip->i_sgid != sgid)) {
			ip->i_suid = suid;
			ip->i_sgid = sgid;
			TRANS_INODE(ufsvfsp, ip);
		}

		if ((ip->i_mode & IFMT) == IFBLK ||
		    (ip->i_mode & IFMT) == IFCHR) {
			dev_t d = ip->i_rdev;
			dev32_t dev32;

			/*
			 * load first direct block only if special device
			 */
			if (!cmpldev(&dev32, d)) {
				/*
				 * We panic here because there's "no way"
				 * we should have been able to create a large
				 * inode with a large dev_t.  Earlier layers
				 * should've caught this.
				 */
				panic("ip %p: i_rdev too big", (void *)ip);
			}

			/*
			 * Store the 32-bit form when the device number has
			 * bits outside the old-format major/minor range,
			 * otherwise store the old compressed form.
			 */
			if (dev32 & ~((O_MAXMAJ << L_BITSMINOR32) | O_MAXMIN)) {
				ip->i_ordev = dev32;	/* can't use old fmt. */
			} else {
				ip->i_ordev = cmpdev(d);
			}
		}

		/*
		 * copy inode to dinode (zero fastsymlnk in dinode)
		 */
		dp = (struct dinode *)bp->b_un.b_addr + itoo(fp, ip->i_number);
		dp->di_ic = ip->i_ic;	/* structure assignment */
		if (flag & IFASTSYMLNK) {
			/* di_db[0] is kept; only the remaining slots are zeroed */
			for (i = 1; i < NDADDR; i++)
				dp->di_db[i] = 0;
			for (i = 0; i < NIADDR; i++)
				dp->di_ib[i] = 0;
		}
		if (TRANS_ISTRANS(ufsvfsp)) {
			/*
			 * Pass only a sector size buffer containing
			 * the inode, otherwise when the buffer is copied
			 * into a cached roll buffer then too much memory
			 * gets consumed if 8KB inode buffers are passed.
			 */
			TRANS_LOG(ufsvfsp, (caddr_t)dp, ip->i_doff,
			    sizeof (struct dinode),
			    (caddr_t)P2ALIGN((uintptr_t)dp, DEV_BSIZE),
			    DEV_BSIZE);

			brelse(bp);
		} else if (waitfor && (ip->i_ufsvfs->vfs_dio == 0)) {
			UFS_BRWRITE(ufsvfsp, bp);

			/*
			 * Synchronous write has guaranteed that inode
			 * has been written on disk so clear the flag
			 */
			mutex_enter(&ip->i_tlock);
			ip->i_flag &= ~IBDWRITE;
			mutex_exit(&ip->i_tlock);
		} else {
			bdrwrite(bp);

			/*
			 * This write hasn't guaranteed that inode has been
			 * written on the disk.
			 * Since, all updat flags on inode are cleared, we must
			 * remember the condition in case inode is to be updated
			 * synchronously later (e.g.- fsync()/fdatasync())
			 * and inode has not been modified yet.
			 */
			mutex_enter(&ip->i_tlock);
			ip->i_flag |= IBDWRITE;
			mutex_exit(&ip->i_tlock);
		}
	} else {
		/*
		 * In case previous inode update was done asynchronously
		 * (IBDWRITE) and this inode update request wants guaranteed
		 * (synchronous) disk update, flush the inode.
		 */
		if (waitfor && (flag & IBDWRITE)) {
			blkflush(ip->i_dev,
			    (daddr_t)fsbtodb(fp, itod(fp, ip->i_number)));
			mutex_enter(&ip->i_tlock);
			ip->i_flag &= ~IBDWRITE;
			mutex_exit(&ip->i_tlock);
		}
	}
}

#define	SINGLE	0	/* index of single indirect block */
#define	DOUBLE	1	/* index of double indirect block */
#define	TRIPLE	2	/* index of triple indirect block */

/*
 * Release blocks associated with the inode ip and
 * stored in the indirect block bn.  Blocks are free'd
 * in LIFO order up to (but not including) lastbn.  If
 * level is greater than SINGLE, the block is an indirect
 * block and recursive calls to indirtrunc must be used to
 * cleanse other indirect blocks.
 *
 * Parameters:
 *   ip     - inode being truncated; i_contents must be held as writer
 *            (asserted below).
 *   bn     - file system block number of the indirect block to process.
 *   lastbn - index, relative to this indirect block's range, of the last
 *            block to keep; -1 releases everything.
 *   level  - SINGLE, DOUBLE or TRIPLE.
 *   flags  - I_* flags handed on to free(); I_CHEAP suppresses the
 *            zero-and-rewrite of the indirect block itself.
 *
 * Returns the amount released, accumulated in units of
 * btodb(fs->fs_bsize) per file system block, or 0 if the indirect block
 * could not be read.
 *
 * N.B.: triple indirect blocks are untested.
 */
static long
indirtrunc(struct inode *ip, daddr_t bn, daddr_t lastbn, int level, int flags)
{
	int i;
	struct buf *bp, *copy;
	daddr32_t *bap;
	struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
	struct fs *fs = ufsvfsp->vfs_fs;
	daddr_t nb, last;
	long factor;
	int blocksreleased = 0, nblocks;

	ASSERT(RW_WRITE_HELD(&ip->i_contents));
	/*
	 * Calculate index in current block of last
	 * block to be kept.  -1 indicates the entire
	 * block so we need not calculate the index.
	 */
	/* factor = number of blocks addressed by one entry at this level */
	factor = 1;
	for (i = SINGLE; i < level; i++)
		factor *= NINDIR(fs);
	last = lastbn;
	if (lastbn > 0)
		last /= factor;
	nblocks = btodb(fs->fs_bsize);
	/*
	 * Get buffer of block pointers, zero those
	 * entries corresponding to blocks to be free'd,
	 * and update on disk copy first.
	 * *Unless* the root pointer has been synchronously
	 * written to disk.  If nothing points to this
	 * indirect block then don't bother zero'ing and
	 * writing it.
	 */
	bp = UFS_BREAD(ufsvfsp,
	    ip->i_dev, (daddr_t)fsbtodb(fs, bn), (int)fs->fs_bsize);
	if (bp->b_flags & B_ERROR) {
		brelse(bp);
		return (0);
	}
	bap = bp->b_un.b_daddr;
	if ((flags & I_CHEAP) == 0) {
		uint_t	zb;

		/* zb = size in bytes of the entries after index `last' */
		zb = (uint_t)((NINDIR(fs) - (last + 1)) * sizeof (daddr32_t));

		if (zb) {
			/*
			 * push any data into the log before we zero it
			 */
			if (bp->b_flags & B_DELWRI)
				TRANS_LOG(ufsvfsp, (caddr_t)bap,
				    ldbtob(bp->b_blkno), bp->b_bcount,
				    bp->b_un.b_addr, bp->b_bcount);
			/*
			 * Keep a private copy of the original pointers: the
			 * zeroed block is written out, and the copy is what
			 * the freeing loops below walk.
			 */
			copy = ngeteblk(fs->fs_bsize);
			bcopy((caddr_t)bap, (caddr_t)copy->b_un.b_daddr,
			    (uint_t)fs->fs_bsize);
			bzero((caddr_t)&bap[last + 1], zb);

			TRANS_BUF(ufsvfsp,
			    (caddr_t)&bap[last + 1] - (caddr_t)bap,
			    zb, bp, DT_ABZERO);

			UFS_BRWRITE(ufsvfsp, bp);
			bp = copy, bap = bp->b_un.b_daddr;
		}
	} else {
		/* make sure write retries are also cleared */
		bp->b_flags &= ~(B_DELWRI | B_RETRYWRI);
		bp->b_flags |= B_STALE | B_AGE;
	}

	/*
	 * Recursively free totally unused blocks.
	 */
	flags |= I_CHEAP;
	for (i = NINDIR(fs) - 1; i > last; i--) {
		nb = bap[i];
		if (nb == 0)
			continue;
		if (level > SINGLE) {
			blocksreleased +=
			    indirtrunc(ip, nb, (daddr_t)-1, level - 1, flags);
			free(ip, nb, (off_t)fs->fs_bsize, flags | I_IBLK);
		} else
			free(ip, nb, (off_t)fs->fs_bsize, flags);
		blocksreleased += nblocks;
	}
	flags &= ~I_CHEAP;

	/*
	 * Recursively free last partial block.
	 * (On loop exit above, i is the entry that is kept but may itself
	 * need partial truncation at the next lower level.)
	 */
	if (level > SINGLE && lastbn >= 0) {
		last = lastbn % factor;
		nb = bap[i];
		if (nb != 0)
			blocksreleased +=
			    indirtrunc(ip, nb, last, level - 1, flags);
	}
	brelse(bp);
	return (blocksreleased);
}

/*
 * Truncate the inode ip to at most length size.
 * Free affected disk blocks -- the blocks of the
 * file are removed in reverse order.
 *
 * N.B.: triple indirect blocks are untested.
*/ static int i_genrand = 1234; int ufs_itrunc(struct inode *oip, u_offset_t length, int flags, cred_t *cr) { struct fs *fs = oip->i_fs; struct ufsvfs *ufsvfsp = oip->i_ufsvfs; struct inode *ip; daddr_t lastblock; off_t bsize; int boff; daddr_t bn, lastiblock[NIADDR]; int level; long nblocks, blocksreleased = 0; int i; ushort_t mode; struct inode tip; int err; u_offset_t maxoffset = (ufsvfsp->vfs_lfflags & UFS_LARGEFILES) ? (UFS_MAXOFFSET_T) : (MAXOFF32_T); /* * Shadow inodes do not need to hold the vfs_dqrwlock lock. Most * other uses need the reader lock. opendq() holds the writer lock. */ ASSERT((oip->i_mode & IFMT) == IFSHAD || RW_LOCK_HELD(&ufsvfsp->vfs_dqrwlock)); ASSERT(RW_WRITE_HELD(&oip->i_contents)); /* * We only allow truncation of regular files and directories * to arbitrary lengths here. In addition, we allow symbolic * links to be truncated only to zero length. Other inode * types cannot have their length set here. Disk blocks are * being dealt with - especially device inodes where * ip->i_ordev is actually being stored in ip->i_db[0]! 
*/ TRANS_INODE(ufsvfsp, oip); mode = oip->i_mode & IFMT; if (flags & I_FREE) { i_genrand *= 16843009; /* turns into shift and adds */ i_genrand++; oip->i_gen += ((i_genrand + lbolt) & 0xffff) + 1; oip->i_flag |= ICHG |IUPD; oip->i_seq++; if (length == oip->i_size) return (0); flags |= I_CHEAP; } if (mode == IFIFO) return (0); if (mode != IFREG && mode != IFDIR && mode != IFATTRDIR && !(mode == IFLNK && length == (offset_t)0) && mode != IFSHAD) return (EINVAL); if (length > maxoffset) return (EFBIG); if ((mode == IFDIR) || (mode == IFATTRDIR)) flags |= I_DIR; if (mode == IFSHAD) flags |= I_SHAD; if (oip == ufsvfsp->vfs_qinod) flags |= I_QUOTA; if (length == oip->i_size) { /* update ctime and mtime to please POSIX tests */ oip->i_flag |= ICHG |IUPD; oip->i_seq++; if (length == 0) { /* nothing to cache so clear the flag */ oip->i_flag &= ~IFASTSYMLNK; } return (0); } /* wipe out fast symlink till next access */ if (oip->i_flag & IFASTSYMLNK) { int j; ASSERT(ITOV(oip)->v_type == VLNK); oip->i_flag &= ~IFASTSYMLNK; for (j = 1; j < NDADDR; j++) oip->i_db[j] = 0; for (j = 0; j < NIADDR; j++) oip->i_ib[j] = 0; } boff = (int)blkoff(fs, length); if (length > oip->i_size) { /* * Trunc up case. BMAPALLOC will insure that the right blocks * are allocated. This includes extending the old frag to a * full block (if needed) in addition to doing any work * needed for allocating the last block. */ if (boff == 0) err = BMAPALLOC(oip, length - 1, (int)fs->fs_bsize, cr); else err = BMAPALLOC(oip, length - 1, boff, cr); if (err == 0) { /* * Save old size and set inode's size now * so that we don't cause too much of the * file to be zero'd and pushed. */ u_offset_t osize = oip->i_size; oip->i_size = length; /* * Make sure we zero out the remaining bytes of * the page in case a mmap scribbled on it. We * can't prevent a mmap from writing beyond EOF * on the last page of a file. * */ if ((boff = (int)blkoff(fs, osize)) != 0) { bsize = (int)lblkno(fs, osize - 1) >= NDADDR ? 
fs->fs_bsize : fragroundup(fs, boff); pvn_vpzero(ITOV(oip), osize, (size_t)(bsize - boff)); } oip->i_flag |= ICHG|IATTCHG; oip->i_seq++; ITIMES_NOLOCK(oip); /* * MAXOFF32_T is old 2GB size limit. If * this operation caused a large file to be * created, turn on the superblock flag * and update the superblock, if the flag * is not already on. */ if ((length > (u_offset_t)MAXOFF32_T) && !(fs->fs_flags & FSLARGEFILES)) { ASSERT(ufsvfsp->vfs_lfflags & UFS_LARGEFILES); mutex_enter(&ufsvfsp->vfs_lock); fs->fs_flags |= FSLARGEFILES; ufs_sbwrite(ufsvfsp); mutex_exit(&ufsvfsp->vfs_lock); } } return (err); } /* * Update the pages of the file. If the file is not being * truncated to a block boundary, the contents of the * pages following the end of the file must be zero'ed * in case it ever become accessible again because * of subsequent file growth. */ if (boff == 0) { (void) pvn_vplist_dirty(ITOV(oip), length, ufs_putapage, B_INVAL | B_TRUNC, CRED()); } else { /* * Make sure that the last block is properly allocated. * We only really have to do this if the last block is * actually allocated since ufs_bmap will now handle the case * of an fragment which has no block allocated. Just to * be sure, we do it now independent of current allocation. */ err = BMAPALLOC(oip, length - 1, boff, cr); if (err) return (err); /* * BMAPALLOC will call bmap_write which defers i_seq * processing. If the timestamps were changed, update * i_seq before rdip drops i_contents or syncs the inode. */ if (oip->i_flag & (ICHG|IUPD)) oip->i_seq++; /* * BugId 4069932 * Make sure that the relevant partial page appears in * the v_pages list, so that pvn_vpzero() will do its * job. Since doing this correctly requires everything * in rdip() except for the uiomove(), it's easier and * safer to do the uiomove() rather than duplicate the * rest of rdip() here. * * To get here, we know that length indicates a byte * that is not the first byte of a block. (length - 1) * is the last actual byte known to exist. 
Deduction * shows it is in the same block as byte (length). * Thus, this rdip() invocation should always succeed * except in the face of i/o errors, and give us the * block we care about. * * rdip() makes the same locking assertions and * assumptions as we do. We do not acquire any locks * before calling it, so we have not changed the locking * situation. Finally, there do not appear to be any * paths whereby rdip() ends up invoking us again. * Thus, infinite recursion is avoided. */ { uio_t uio; iovec_t iov[1]; char buffer; uio.uio_iov = iov; uio.uio_iovcnt = 1; uio.uio_loffset = length - 1; uio.uio_resid = 1; uio.uio_segflg = UIO_SYSSPACE; uio.uio_extflg = UIO_COPY_CACHED; iov[0].iov_base = &buffer; iov[0].iov_len = 1; err = rdip(oip, &uio, UIO_READ, NULL); if (err) return (err); } bsize = (int)lblkno(fs, length - 1) >= NDADDR ? fs->fs_bsize : fragroundup(fs, boff); pvn_vpzero(ITOV(oip), length, (size_t)(bsize - boff)); /* * Ensure full fs block is marked as dirty. */ (void) pvn_vplist_dirty(ITOV(oip), length + (bsize - boff), ufs_putapage, B_INVAL | B_TRUNC, CRED()); } /* * Calculate index into inode's block list of * last direct and indirect blocks (if any) * which we want to keep. Lastblock is -1 when * the file is truncated to 0. */ lastblock = lblkno(fs, length + fs->fs_bsize - 1) - 1; lastiblock[SINGLE] = lastblock - NDADDR; lastiblock[DOUBLE] = lastiblock[SINGLE] - NINDIR(fs); lastiblock[TRIPLE] = lastiblock[DOUBLE] - NINDIR(fs) * NINDIR(fs); nblocks = btodb(fs->fs_bsize); /* * Update file and block pointers * on disk before we start freeing blocks. * If we crash before free'ing blocks below, * the blocks will be returned to the free list. * lastiblock values are also normalized to -1 * for calls to indirtrunc below. 
*/ tip = *oip; /* structure copy */ ip = &tip; for (level = TRIPLE; level >= SINGLE; level--) if (lastiblock[level] < 0) { oip->i_ib[level] = 0; lastiblock[level] = -1; } for (i = NDADDR - 1; i > lastblock; i--) { oip->i_db[i] = 0; flags |= I_CHEAP; } oip->i_size = length; oip->i_flag |= ICHG|IUPD|IATTCHG; oip->i_seq++; if (!TRANS_ISTRANS(ufsvfsp)) ufs_iupdat(oip, I_SYNC); /* do sync inode update */ /* * Indirect blocks first. */ for (level = TRIPLE; level >= SINGLE; level--) { bn = ip->i_ib[level]; if (bn != 0) { blocksreleased += indirtrunc(ip, bn, lastiblock[level], level, flags); if (lastiblock[level] < 0) { ip->i_ib[level] = 0; free(ip, bn, (off_t)fs->fs_bsize, flags | I_IBLK); blocksreleased += nblocks; } } if (lastiblock[level] >= 0) goto done; } /* * All whole direct blocks or frags. */ for (i = NDADDR - 1; i > lastblock; i--) { bn = ip->i_db[i]; if (bn == 0) continue; ip->i_db[i] = 0; bsize = (off_t)blksize(fs, ip, i); free(ip, bn, bsize, flags); blocksreleased += btodb(bsize); } if (lastblock < 0) goto done; /* * Finally, look for a change in size of the * last direct block; release any frags. */ bn = ip->i_db[lastblock]; if (bn != 0) { off_t oldspace, newspace; /* * Calculate amount of space we're giving * back as old block size minus new block size. */ oldspace = blksize(fs, ip, lastblock); UFS_SET_ISIZE(length, ip); newspace = blksize(fs, ip, lastblock); if (newspace == 0) { err = ufs_fault(ITOV(ip), "ufs_itrunc: newspace == 0"); return (err); } if (oldspace - newspace > 0) { /* * Block number of space to be free'd is * the old block # plus the number of frags * required for the storage we're keeping. 
*/
			bn += numfrags(fs, newspace);
			free(ip, bn, oldspace - newspace, flags);
			blocksreleased += btodb(oldspace - newspace);
		}
	}
done:
/* BEGIN PARANOIA */
	for (level = SINGLE; level <= TRIPLE; level++)
		if (ip->i_ib[level] != oip->i_ib[level]) {
			err = ufs_fault(ITOV(ip), "ufs_itrunc: indirect block");
			return (err);
		}

	for (i = 0; i < NDADDR; i++)
		if (ip->i_db[i] != oip->i_db[i]) {
			err = ufs_fault(ITOV(ip), "ufs_itrunc: direct block");
			return (err);
		}
/* END PARANOIA */
	oip->i_blocks -= blocksreleased;

	if (oip->i_blocks < 0) {		/* sanity */
		cmn_err(CE_NOTE,
		    "ufs_itrunc: %s/%d new size = %lld, blocks = %d\n",
		    fs->fs_fsmnt, (int)oip->i_number, oip->i_size,
		    (int)oip->i_blocks);
		oip->i_blocks = 0;
	}
	oip->i_flag |= ICHG|IATTCHG;
	oip->i_seq++;
	/* blocksreleased is >= zero, so this can not fail */
	(void) chkdq(oip, -blocksreleased, 0, cr, (char **)NULL,
	    (size_t *)NULL);
	return (0);
}

/*
 * Check mode permission on inode.  Mode is READ, WRITE or EXEC.
 * In the case of WRITE, the read-only status of the file system
 * is checked.  Depending on the calling user, the appropriate
 * mode bits are selected; privileges to override missing permission
 * bits are checked through secpolicy_vnode_access().
 *
 * vip is the inode to check (passed as a void pointer), mode holds the
 * requested access bits and cr is the credential being checked.
 *
 * Returns 0 when every requested bit is granted by the inode's mode,
 * EROFS for a write request on a read-only file system (unless the
 * inode is a character device, block device or FIFO), and otherwise
 * whatever ufs_acl_access() or secpolicy_vnode_access() returns.
 */
int
ufs_iaccess(void *vip, int mode, struct cred *cr)
{
	struct inode *ip = vip;
	int shift = 0;	/* how far i_mode is shifted before the compare */

	if (mode & IWRITE) {
		/*
		 * Disallow write attempts on read-only
		 * file systems, unless the file is a block
		 * or character device or a FIFO.
		 */
		if (ip->i_fs->fs_ronly != 0) {
			if ((ip->i_mode & IFMT) != IFCHR &&
			    (ip->i_mode & IFMT) != IFBLK &&
			    (ip->i_mode & IFMT) != IFIFO) {
				return (EROFS);
			}
		}
	}
	/*
	 * If there is a shadow inode check for the presence of an acl,
	 * if the acl is there use the ufs_acl_access routine to check
	 * the acl
	 */
	if (ip->i_ufs_acl && ip->i_ufs_acl->aowner)
		return (ufs_acl_access(ip, mode, cr));

	/*
	 * Access check is based on only
	 * one of owner, group, public.
	 * If not owner, then check group.
	 * If not a member of the group, then
	 * check public access.
	 */
	if (crgetuid(cr) != ip->i_uid) {
		shift += 3;
		if (!groupmember((uid_t)ip->i_gid, cr))
			shift += 3;
	}

	/*
	 * i_mode is shifted left by 0 (owner), 3 (group) or 6 (public)
	 * bits so that the selected permission triplet lines up with the
	 * requested bits; every requested bit that the inode grants is
	 * then cleared from mode.
	 */
	mode &= ~(ip->i_mode << shift);

	/* Nothing left over: the mode bits alone grant the request. */
	if (mode == 0)
		return (0);

	/* test missing privilege bits */
	return (secpolicy_vnode_access(cr, ITOV(ip), ip->i_uid, mode));
}

/*
 * if necessary, remove an inode from the free list
 *	i_contents is held except at unmount
 *
 * Return 1 if the inode is taken off of the ufs_idle_q,
 * and the caller is expected to call VN_RELE.
 *
 * Return 0 otherwise.
 */
int
ufs_rmidle(struct inode *ip)
{
	int rval = 0;

	mutex_enter(&ip->i_tlock);
	/* Only an inode without IREF set is unlinked here. */
	if ((ip->i_flag & IREF) == 0) {
		mutex_enter(&ufs_idle_q.uq_mutex);
		/* Splice ip out of the doubly linked i_freef/i_freeb list. */
		ip->i_freef->i_freeb = ip->i_freeb;
		ip->i_freeb->i_freef = ip->i_freef;
		/* Leave ip pointing at itself in both directions. */
		ip->i_freef = ip;
		ip->i_freeb = ip;
		ip->i_flag |= IREF;
		ufs_idle_q.uq_ne--;
		/* Keep the junk/useful counters in step with the queue. */
		if (ip->i_flag & IJUNKIQ) {
			ufs_njunk_iq--;
			ip->i_flag &= ~IJUNKIQ;
		} else {
			ufs_nuseful_iq--;
		}
		mutex_exit(&ufs_idle_q.uq_mutex);
		rval = 1;
	}
	mutex_exit(&ip->i_tlock);
	return (rval);
}

/*
 * scan the hash of inodes and call func with the inode locked
 *
 * Every hash bucket is walked in order.  For each inode that is
 * selected, func(ip, arg) is invoked while this routine holds a
 * VN_HOLD() on the inode's vnode; by that point the hash chain lock
 * has been dropped and i_contents has been acquired as writer and
 * released again.  Inodes with ISTALE set are not passed to func.
 *
 * rwtry   - if non-zero, i_contents is taken with rw_tryenter() and the
 *	     inode is skipped when the lock is not immediately available;
 *	     if zero, rw_enter() is used.
 * func    - callback applied to each selected inode.
 * arg     - opaque argument handed through to func.
 * ufsvfsp - if non-NULL, only inodes whose i_ufsvfs matches are
 *	     processed; if NULL, all inodes are processed.
 *
 * Returns the most recent non-zero value returned by func, or 0 if
 * func never returned non-zero.
 */
int
ufs_scan_inodes(int rwtry, int (*func)(struct inode *, void *), void *arg,
    struct ufsvfs *ufsvfsp)
{
	struct inode *ip;		/* current inode */
	struct inode *lip = NULL;	/* last/previous inode */
	union ihead *ih;		/* current hash chain */
	int error, i;
	int saverror = 0;
	int lip_held;			/* lip needs a VN_RELE() */

	/*
	 * If ufsvfsp is NULL, then our caller should be holding
	 * ufs_scan_lock to avoid conflicts between ufs_unmount() and
	 * ufs_update().  Otherwise, to avoid false-positives in
	 * ufs_unmount()'s v_count-based EBUSY check, we only hold
	 * those inodes that are in the file system our caller cares
	 * about.
	 *
	 * We know that ip is a valid inode in the hash chain (and thus
	 * we can trust i_ufsvfs) because the inode we chained from
	 * (lip) is still in the hash chain.  This is true because either:
	 *
	 * 1. We did not drop the hash chain lock since the last
	 *    iteration (because we were not interested in the last inode),
	 * or
	 * 2. We maintained a hold on the last inode while we
	 *    were processing it, so it could not be removed
	 *    from the hash chain.
	 *
	 * The whole reason we're dropping and re-grabbing the chain
	 * lock on every inode is so that we don't present a major
	 * choke point on throughput, particularly when we've been
	 * called on behalf of fsflush.
	 */

	for (i = 0, ih = ihead; i < inohsz; i++, ih++) {
		mutex_enter(&ih_lock[i]);
		for (ip = ih->ih_chain[0], lip_held = 0;
		    ip != (struct inode *)ih;
		    ip = lip->i_forw) {

			ins.in_scan.value.ul++;

			/*
			 * Undo the previous iteration's VN_HOLD(), but
			 * only if one was done.
			 */
			if (lip_held)
				VN_RELE(ITOV(lip));

			lip = ip;
			if (ufsvfsp != NULL && ip->i_ufsvfs != ufsvfsp) {
				/*
				 * We're not processing all inodes, and
				 * this inode is not in the filesystem of
				 * interest, so skip it.  No need to do a
				 * VN_HOLD() since we're not dropping the
				 * hash chain lock until after we've
				 * done the i_forw traversal above.
				 */
				lip_held = 0;
				continue;
			}
			VN_HOLD(ITOV(ip));
			lip_held = 1;
			mutex_exit(&ih_lock[i]);

			/*
			 * Acquire the contents lock as writer to make
			 * sure that the inode has been initialized in
			 * the cache or removed from the idle list by
			 * ufs_iget().  This works because ufs_iget()
			 * acquires the contents lock before putting
			 * the inode into the cache.  If we can lock
			 * it, then he's done with it.
			 */

			if (rwtry) {
				if (!rw_tryenter(&ip->i_contents, RW_WRITER)) {
					/*
					 * Lock is busy: retake the chain
					 * lock and move on.  The hold taken
					 * above is released at the top of
					 * the next iteration (or after the
					 * loop) because lip_held is set.
					 */
					mutex_enter(&ih_lock[i]);
					continue;
				}
			} else {
				rw_enter(&ip->i_contents, RW_WRITER);
			}

			rw_exit(&ip->i_contents);

			/*
			 * ISTALE means the inode couldn't be read
			 *
			 * We don't have to hold the i_contents lock
			 * for this check for a couple of
			 * reasons.  First, if ISTALE is set then the
			 * flag cannot be cleared until the inode is
			 * removed from the cache and that cannot
			 * happen until after we VN_RELE() it.
			 * Second, if ISTALE is not set, then the
			 * inode is in the cache and does not need to
			 * be read from disk so ISTALE cannot be set
			 * while we are not looking.
			 */
			if ((ip->i_flag & ISTALE) == 0) {
				/* Remember the failure but keep scanning. */
				if ((error = (*func)(ip, arg)) != 0)
					saverror = error;
			}

			/* Retake the chain lock before following i_forw. */
			mutex_enter(&ih_lock[i]);
		}
		/* Drop the hold left over from the last inode of the chain. */
		if (lip_held)
			VN_RELE(ITOV(lip));
		mutex_exit(&ih_lock[i]);
	}
	return (saverror);
}

/*
 * Mark inode with the current time, plus a unique increment.
 *
 * Since we only keep 32-bit time on disk, if UFS is still alive
 * beyond 2038, filesystem times will simply stick at the last
 * possible second of 32-bit time. Not ideal, but probably better
 * than going into the remote past, or confusing applications with
 * negative time.
 *
 * The stamp is taken from the shared iuniqtime value, which is updated
 * under ufs_iuniqtime_lock.  It is copied into i_atime when IACC is set
 * (and the file system is not noatime), into i_mtime when IUPD is set,
 * and into i_ctime when ICHG is set.  The IACC/IUPD/ICHG flags
 * themselves are not cleared here.
 */
void
ufs_imark(struct inode *ip)
{
	timestruc_t now;
	int32_t usec, nsec;

	/*
	 * The update of i_seq may have been deferred, increase i_seq here
	 * to make sure it is in sync with the timestamps.
	 */
	if (ip->i_flag & ISEQ) {
		ASSERT(ip->i_flag & (IUPD|ICHG));
		ip->i_seq++;
		ip->i_flag &= ~ISEQ;
	}

	gethrestime(&now);

	/*
	 * Fast algorithm to convert nsec to usec -- see hrt2ts()
	 * in common/os/timers.c for a full description.
	 */
	nsec = now.tv_nsec;
	usec = nsec + (nsec >> 2);
	usec = nsec + (usec >> 1);
	usec = nsec + (usec >> 2);
	usec = nsec + (usec >> 4);
	usec = nsec - (usec >> 3);
	usec = nsec + (usec >> 2);
	usec = nsec + (usec >> 3);
	usec = nsec + (usec >> 4);
	usec = nsec + (usec >> 1);
	usec = nsec + (usec >> 6);
	usec = usec >> 10;

	mutex_enter(&ufs_iuniqtime_lock);
	/*
	 * If the clock reading is newer than the last stamp handed out,
	 * adopt it; otherwise bump the last stamp by one microsecond so
	 * that this stamp still differs from the previous one.  Neither
	 * branch changes iuniqtime once the relevant seconds value has
	 * reached TIME32_MAX.
	 *
	 * NOTE(review): the usec comparison is evaluated even when
	 * now.tv_sec is smaller than iuniqtime.tv_sec -- confirm that this
	 * cannot move iuniqtime backward.
	 */
	if (now.tv_sec > (time_t)iuniqtime.tv_sec ||
	    usec > iuniqtime.tv_usec) {
		if (now.tv_sec < TIME32_MAX) {
			iuniqtime.tv_sec = (time32_t)now.tv_sec;
			iuniqtime.tv_usec = usec;
		}
	} else {
		if (iuniqtime.tv_sec < TIME32_MAX) {
			iuniqtime.tv_usec++;
			/* Check for usec overflow */
			if (iuniqtime.tv_usec >= MICROSEC) {
				iuniqtime.tv_sec++;
				iuniqtime.tv_usec = 0;
			}
		}
	}

	/* Access time is left alone on a noatime file system. */
	if ((ip->i_flag & IACC) && !(ip->i_ufsvfs->vfs_noatime)) {
		ip->i_atime = iuniqtime;
	}
	if (ip->i_flag & IUPD) {
		ip->i_mtime = iuniqtime;
		ip->i_flag |= IMODTIME;
	}
	if (ip->i_flag & ICHG) {
		ip->i_diroff = 0;
		ip->i_ctime = iuniqtime;
	}
	mutex_exit(&ufs_iuniqtime_lock);
}

/*
 * Update timestamps in inode.
*/
void
ufs_itimes_nolock(struct inode *ip)
{
	/* Snapshot of the update/access/change requests still pending. */
	const int pending = ip->i_flag & (IUPD|IACC|ICHG);

	/* No timestamp request is outstanding, so there is nothing to do. */
	if (pending == 0)
		return;

	/*
	 * An access-time request on its own is dropped without touching
	 * the inode when the file system has noatime set.
	 */
	if (pending == IACC && ip->i_ufsvfs->vfs_noatime)
		return;

	/*
	 * A pending change request marks the inode IMOD; any other
	 * combination marks it IMODACC.  ufs_imark() then stamps the
	 * times, after which the three request flags are cleared.
	 */
	ip->i_flag |= (pending & ICHG) ? IMOD : IMODACC;
	ufs_imark(ip);
	ip->i_flag &= ~(IACC|IUPD|ICHG);
}