/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M% %I% %E% SMI"

#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include		/* must be AFTER ! */
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

extern struct vnode *common_specvp(struct vnode *vp);

/* error lock status */
#define	UN_ERRLCK	(-1)
#define	SET_ERRLCK	1
#define	RE_ERRLCK	2
#define	NO_ERRLCK	0

/*
 * Index to be used in TSD for storing lockfs data
 */
uint_t ufs_lockfs_key;

typedef struct _ulockfs_info {
	struct _ulockfs_info *next;
	struct ulockfs *ulp;
	uint_t flags;
} ulockfs_info_t;

#define	ULOCK_INFO_FALLOCATE	0x00000001	/* fallocate thread */

/*
 * Check in TSD whether we are already doing any VOP on this filesystem
 */
#define	IS_REC_VOP(found, head, ulp, free)		\
{							\
	ulockfs_info_t *_curr;				\
							\
	for (found = 0, free = NULL, _curr = head;	\
	    _curr != NULL; _curr = _curr->next) {	\
		if ((free == NULL) &&			\
		    (_curr->ulp == NULL))		\
			free = _curr;			\
		if (_curr->ulp == ulp) {		\
			found = 1;			\
			break;				\
		}					\
	}						\
}

/*
 * Get the lockfs data from TSD so that lockfs handles the recursive VOP
 * properly
 */
#define	SEARCH_ULOCKFSP(head, ulp, info)		\
{							\
	ulockfs_info_t *_curr;				\
							\
	for (_curr = head; _curr != NULL;		\
	    _curr = _curr->next) {			\
		if (_curr->ulp == ulp) {		\
			break;				\
		}					\
	}						\
							\
	info = _curr;					\
}

/*
 * Validate lockfs request
 */
static int
ufs_getlfd(
	struct lockfs *lockfsp,		/* new lock request */
	struct lockfs *ul_lockfsp)	/* old lock state */
{
	int	error = 0;

	/*
	 * no input flags defined
	 */
	if (lockfsp->lf_flags != 0) {
		error = EINVAL;
		goto errout;
	}

	/*
	 * check key
	 */
	if (!LOCKFS_IS_ULOCK(ul_lockfsp))
		if (lockfsp->lf_key != ul_lockfsp->lf_key) {
			error = EINVAL;
			goto errout;
		}

	lockfsp->lf_key = ul_lockfsp->lf_key + 1;

errout:
	return (error);
}

/*
 * ufs_checkaccton
 *	check if accounting is turned on for this fs
 */
int
ufs_checkaccton(struct vnode *vp)
{
	if (acct_fs_in_use(vp))
		return (EDEADLK);
	return (0);
}

/*
 * ufs_checkswapon
 *	check if local swapping is to file on this fs
 */
int
ufs_checkswapon(struct vnode *vp)
{
	struct swapinfo	*sip;

	mutex_enter(&swapinfo_lock);
	for (sip = swapinfo; sip; sip = sip->si_next)
		if (sip->si_vp->v_vfsp == vp->v_vfsp) {
			mutex_exit(&swapinfo_lock);
			return (EDEADLK);
		}
	mutex_exit(&swapinfo_lock);
	return (0);
}

/*
 * ufs_freeze
 *	pend future accesses for current lock and desired lock
 */
void
ufs_freeze(struct ulockfs *ulp, struct lockfs *lockfsp) { /* * set to new lock type */ ulp->ul_lockfs.lf_lock = lockfsp->lf_lock; ulp->ul_lockfs.lf_key = lockfsp->lf_key; ulp->ul_lockfs.lf_comlen = lockfsp->lf_comlen; ulp->ul_lockfs.lf_comment = lockfsp->lf_comment; ulp->ul_fs_lock = (1 << ulp->ul_lockfs.lf_lock); } /* * All callers of ufs_quiesce() atomically increment ufs_quiesce_pend before * starting ufs_quiesce() protocol and decrement it only when a file system no * longer has to be in quiescent state. This allows ufs_pageio() to detect * that another thread wants to quiesce a file system. See more comments in * ufs_pageio(). */ ulong_t ufs_quiesce_pend = 0; /* * ufs_quiesce * wait for outstanding accesses to finish */ int ufs_quiesce(struct ulockfs *ulp) { int error = 0; ulockfs_info_t *head; ulockfs_info_t *info; head = (ulockfs_info_t *)tsd_get(ufs_lockfs_key); SEARCH_ULOCKFSP(head, ulp, info); /* * Set a softlock to suspend future ufs_vnops so that * this lockfs request will not be starved */ ULOCKFS_SET_SLOCK(ulp); ASSERT(ufs_quiesce_pend); /* check if there is any outstanding ufs vnodeops calls */ while (ulp->ul_vnops_cnt || ulp->ul_falloc_cnt) { /* * use timed version of cv_wait_sig() to make sure we don't * miss a wake up call from ufs_pageio() when it doesn't use * ul_lock. * * when a fallocate thread comes in, the only way it returns * from this function is if there are no other vnode operations * going on (remember fallocate threads are tracked using * ul_falloc_cnt not ul_vnops_cnt), and another fallocate thread * hasn't already grabbed the fs write lock. */ if (info && (info->flags & ULOCK_INFO_FALLOCATE)) { if (!ulp->ul_vnops_cnt && !ULOCKFS_IS_FWLOCK(ulp)) goto out; } if (!cv_timedwait_sig(&ulp->ul_cv, &ulp->ul_lock, lbolt + hz)) { error = EINTR; goto out; } } out: /* * unlock the soft lock */ ULOCKFS_CLR_SLOCK(ulp); return (error); } /* * ufs_flush_inode */ int ufs_flush_inode(struct inode *ip, void *arg) { int error; int saverror = 0; /* * wrong file system; keep looking */ if (ip->i_ufsvfs != (struct ufsvfs *)arg) return (0); /* * asynchronously push all the dirty pages */ if (((error = TRANS_SYNCIP(ip, B_ASYNC, 0, TOP_SYNCIP_FLUSHI)) != 0) && (error != EAGAIN)) saverror = error; /* * wait for io and discard all mappings */ if (error = TRANS_SYNCIP(ip, B_INVAL, 0, TOP_SYNCIP_FLUSHI)) saverror = error; if (ITOV(ip)->v_type == VDIR) { dnlc_dir_purge(&ip->i_danchor); } return (saverror); } /* * ufs_flush * Flush everything that is currently dirty; this includes invalidating * any mappings. 
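 *
 *	In order this purges the dnlc, drains the delete and idle threads,
 *	syncs the quota records, flushes and invalidates every inode on
 *	this fs, pushes the superblock and summary info if modified,
 *	invalidates the device pages and buffer cache and, when logging is
 *	enabled (and LDL_NOROLL is clear), commits and rolls the log.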
*/ int ufs_flush(struct vfs *vfsp) { int error; int saverror = 0; struct ufsvfs *ufsvfsp = (struct ufsvfs *)vfsp->vfs_data; struct fs *fs = ufsvfsp->vfs_fs; int tdontblock = 0; ASSERT(vfs_lock_held(vfsp)); /* * purge dnlc */ (void) dnlc_purge_vfsp(vfsp, 0); /* * drain the delete and idle threads */ ufs_delete_drain(vfsp, 0, 0); ufs_idle_drain(vfsp); /* * flush and invalidate quota records */ (void) qsync(ufsvfsp); /* * flush w/invalidate the inodes for vfsp */ if (error = ufs_scan_inodes(0, ufs_flush_inode, ufsvfsp, ufsvfsp)) saverror = error; /* * synchronously flush superblock and summary info */ if (fs->fs_ronly == 0 && fs->fs_fmod) { fs->fs_fmod = 0; TRANS_SBUPDATE(ufsvfsp, vfsp, TOP_SBUPDATE_FLUSH); } /* * flush w/invalidate block device pages and buf cache */ if ((error = VOP_PUTPAGE(common_specvp(ufsvfsp->vfs_devvp), (offset_t)0, 0, B_INVAL, CRED())) > 0) saverror = error; (void) bflush((dev_t)vfsp->vfs_dev); (void) bfinval((dev_t)vfsp->vfs_dev, 0); /* * drain the delete and idle threads again */ ufs_delete_drain(vfsp, 0, 0); ufs_idle_drain(vfsp); /* * play with the clean flag */ if (saverror == 0) ufs_checkclean(vfsp); /* * Flush any outstanding transactions and roll the log * only if we are supposed to do, i.e. LDL_NOROLL not set. * We can not simply check for fs_ronly here since fsck also may * use this code to roll the log on a read-only filesystem, e.g. * root during early stages of boot, if other then a sanity check is * done, it will clear LDL_NOROLL before. * In addition we assert that the deltamap does not contain any deltas * in case LDL_NOROLL is set since this is not supposed to happen. */ if (TRANS_ISTRANS(ufsvfsp)) { ml_unit_t *ul = ufsvfsp->vfs_log; mt_map_t *mtm = ul->un_deltamap; if (ul->un_flags & LDL_NOROLL) { ASSERT(mtm->mtm_nme == 0); } else { /* * Do not set T_DONTBLOCK if there is a * transaction opened by caller. */ if (curthread->t_flag & T_DONTBLOCK) tdontblock = 1; else curthread->t_flag |= T_DONTBLOCK; TRANS_BEGIN_SYNC(ufsvfsp, TOP_COMMIT_FLUSH, TOP_COMMIT_SIZE, error); if (!error) { TRANS_END_SYNC(ufsvfsp, saverror, TOP_COMMIT_FLUSH, TOP_COMMIT_SIZE); } if (tdontblock == 0) curthread->t_flag &= ~T_DONTBLOCK; logmap_roll_dev(ufsvfsp->vfs_log); } } return (saverror); } /* * ufs_thaw_wlock * special processing when thawing down to wlock */ static int ufs_thaw_wlock(struct inode *ip, void *arg) { /* * wrong file system; keep looking */ if (ip->i_ufsvfs != (struct ufsvfs *)arg) return (0); /* * iupdat refuses to clear flags if the fs is read only. The fs * may become read/write during the lock and we wouldn't want * these inodes being written to disk. So clear the flags. 
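 *	(ufs_thaw() applies this to every inode via ufs_scan_inodes()
 *	when thawing down to a write lock.)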
*/ rw_enter(&ip->i_contents, RW_WRITER); ip->i_flag &= ~(IMOD|IMODACC|IACC|IUPD|ICHG|IATTCHG); rw_exit(&ip->i_contents); /* * pages are mlocked -- fail wlock */ if (ITOV(ip)->v_type != VCHR && vn_has_cached_data(ITOV(ip))) return (EBUSY); return (0); } /* * ufs_thaw_hlock * special processing when thawing down to hlock or elock */ static int ufs_thaw_hlock(struct inode *ip, void *arg) { struct vnode *vp = ITOV(ip); /* * wrong file system; keep looking */ if (ip->i_ufsvfs != (struct ufsvfs *)arg) return (0); /* * blow away all pages - even if they are mlocked */ do { (void) TRANS_SYNCIP(ip, B_INVAL | B_FORCE, 0, TOP_SYNCIP_HLOCK); } while ((vp->v_type != VCHR) && vn_has_cached_data(vp)); rw_enter(&ip->i_contents, RW_WRITER); ip->i_flag &= ~(IMOD|IMODACC|IACC|IUPD|ICHG|IATTCHG); rw_exit(&ip->i_contents); return (0); } /* * ufs_thaw * thaw file system lock down to current value */ int ufs_thaw(struct vfs *vfsp, struct ufsvfs *ufsvfsp, struct ulockfs *ulp) { int error = 0; int noidel = (int)(ulp->ul_flag & ULOCKFS_NOIDEL); /* * if wlock or hlock or elock */ if (ULOCKFS_IS_WLOCK(ulp) || ULOCKFS_IS_HLOCK(ulp) || ULOCKFS_IS_ELOCK(ulp)) { /* * don't keep access times * don't free deleted files * if superblock writes are allowed, limit them to me for now */ ulp->ul_flag |= (ULOCKFS_NOIACC|ULOCKFS_NOIDEL); if (ulp->ul_sbowner != (kthread_id_t)-1) ulp->ul_sbowner = curthread; /* * wait for writes for deleted files and superblock updates */ (void) ufs_flush(vfsp); /* * now make sure the quota file is up-to-date * expensive; but effective */ error = ufs_flush(vfsp); /* * no one can write the superblock */ ulp->ul_sbowner = (kthread_id_t)-1; /* * special processing for wlock/hlock/elock */ if (ULOCKFS_IS_WLOCK(ulp)) { if (error) goto errout; error = bfinval(ufsvfsp->vfs_dev, 0); if (error) goto errout; error = ufs_scan_inodes(0, ufs_thaw_wlock, (void *)ufsvfsp, ufsvfsp); if (error) goto errout; } if (ULOCKFS_IS_HLOCK(ulp) || ULOCKFS_IS_ELOCK(ulp)) { error = 0; (void) ufs_scan_inodes(0, ufs_thaw_hlock, (void *)ufsvfsp, ufsvfsp); (void) bfinval(ufsvfsp->vfs_dev, 1); } } else { /* * okay to keep access times * okay to free deleted files * okay to write the superblock */ ulp->ul_flag &= ~(ULOCKFS_NOIACC|ULOCKFS_NOIDEL); ulp->ul_sbowner = NULL; /* * flush in case deleted files are in memory */ if (noidel) { if (error = ufs_flush(vfsp)) goto errout; } } errout: cv_broadcast(&ulp->ul_cv); return (error); } /* * ufs_reconcile_fs * reconcile incore superblock with ondisk superblock */ int ufs_reconcile_fs(struct vfs *vfsp, struct ufsvfs *ufsvfsp, int errlck) { struct fs *mfs; /* in-memory superblock */ struct fs *dfs; /* on-disk superblock */ struct buf *bp; /* on-disk superblock buf */ int needs_unlock; char finished_fsclean; mfs = ufsvfsp->vfs_fs; /* * get the on-disk copy of the superblock */ bp = UFS_BREAD(ufsvfsp, vfsp->vfs_dev, SBLOCK, SBSIZE); bp->b_flags |= (B_STALE|B_AGE); if (bp->b_flags & B_ERROR) { brelse(bp); return (EIO); } dfs = bp->b_un.b_fs; /* error locks may only unlock after the fs has been made consistent */ if (errlck == UN_ERRLCK) { if (dfs->fs_clean == FSFIX) { /* being repaired */ brelse(bp); return (EAGAIN); } /* repair not yet started? */ finished_fsclean = TRANS_ISTRANS(ufsvfsp)? 
FSLOG: FSCLEAN; if (dfs->fs_clean != finished_fsclean) { brelse(bp); return (EBUSY); } } /* * if superblock has changed too much, abort */ if ((mfs->fs_sblkno != dfs->fs_sblkno) || (mfs->fs_cblkno != dfs->fs_cblkno) || (mfs->fs_iblkno != dfs->fs_iblkno) || (mfs->fs_dblkno != dfs->fs_dblkno) || (mfs->fs_cgoffset != dfs->fs_cgoffset) || (mfs->fs_cgmask != dfs->fs_cgmask) || (mfs->fs_bsize != dfs->fs_bsize) || (mfs->fs_fsize != dfs->fs_fsize) || (mfs->fs_frag != dfs->fs_frag) || (mfs->fs_bmask != dfs->fs_bmask) || (mfs->fs_fmask != dfs->fs_fmask) || (mfs->fs_bshift != dfs->fs_bshift) || (mfs->fs_fshift != dfs->fs_fshift) || (mfs->fs_fragshift != dfs->fs_fragshift) || (mfs->fs_fsbtodb != dfs->fs_fsbtodb) || (mfs->fs_sbsize != dfs->fs_sbsize) || (mfs->fs_nindir != dfs->fs_nindir) || (mfs->fs_nspf != dfs->fs_nspf) || (mfs->fs_trackskew != dfs->fs_trackskew) || (mfs->fs_cgsize != dfs->fs_cgsize) || (mfs->fs_ntrak != dfs->fs_ntrak) || (mfs->fs_nsect != dfs->fs_nsect) || (mfs->fs_spc != dfs->fs_spc) || (mfs->fs_cpg != dfs->fs_cpg) || (mfs->fs_ipg != dfs->fs_ipg) || (mfs->fs_fpg != dfs->fs_fpg) || (mfs->fs_postblformat != dfs->fs_postblformat) || (mfs->fs_magic != dfs->fs_magic)) { brelse(bp); return (EACCES); } if (dfs->fs_clean == FSBAD || FSOKAY != dfs->fs_state + dfs->fs_time) if (mfs->fs_clean == FSLOG) { brelse(bp); return (EACCES); } /* * get new summary info */ if (ufs_getsummaryinfo(vfsp->vfs_dev, ufsvfsp, dfs)) { brelse(bp); return (EIO); } /* * release old summary info and update in-memory superblock */ kmem_free(mfs->fs_u.fs_csp, mfs->fs_cssize); mfs->fs_u.fs_csp = dfs->fs_u.fs_csp; /* Only entry 0 used */ /* * update fields allowed to change */ mfs->fs_size = dfs->fs_size; mfs->fs_dsize = dfs->fs_dsize; mfs->fs_ncg = dfs->fs_ncg; mfs->fs_minfree = dfs->fs_minfree; mfs->fs_rotdelay = dfs->fs_rotdelay; mfs->fs_rps = dfs->fs_rps; mfs->fs_maxcontig = dfs->fs_maxcontig; mfs->fs_maxbpg = dfs->fs_maxbpg; mfs->fs_csmask = dfs->fs_csmask; mfs->fs_csshift = dfs->fs_csshift; mfs->fs_optim = dfs->fs_optim; mfs->fs_csaddr = dfs->fs_csaddr; mfs->fs_cssize = dfs->fs_cssize; mfs->fs_ncyl = dfs->fs_ncyl; mfs->fs_cstotal = dfs->fs_cstotal; mfs->fs_reclaim = dfs->fs_reclaim; if (mfs->fs_reclaim & (FS_RECLAIM|FS_RECLAIMING)) { mfs->fs_reclaim &= ~FS_RECLAIM; mfs->fs_reclaim |= FS_RECLAIMING; ufs_thread_start(&ufsvfsp->vfs_reclaim, ufs_thread_reclaim, vfsp); } /* XXX What to do about sparecon? 
*/ /* XXX need to copy volume label */ /* * ondisk clean flag overrides inmemory clean flag iff == FSBAD * or if error-locked and ondisk is now clean */ needs_unlock = !MUTEX_HELD(&ufsvfsp->vfs_lock); if (needs_unlock) mutex_enter(&ufsvfsp->vfs_lock); if (errlck == UN_ERRLCK) { if (finished_fsclean == dfs->fs_clean) mfs->fs_clean = finished_fsclean; else mfs->fs_clean = FSBAD; mfs->fs_state = FSOKAY - dfs->fs_time; } if (FSOKAY != dfs->fs_state + dfs->fs_time || (dfs->fs_clean == FSBAD)) mfs->fs_clean = FSBAD; if (needs_unlock) mutex_exit(&ufsvfsp->vfs_lock); brelse(bp); return (0); } /* * ufs_reconcile_inode * reconcile ondisk inode with incore inode */ static int ufs_reconcile_inode(struct inode *ip, void *arg) { int i; int ndaddr; int niaddr; struct dinode *dp; /* ondisk inode */ struct buf *bp = NULL; uid_t d_uid; gid_t d_gid; int error = 0; struct fs *fs; /* * not an inode we care about */ if (ip->i_ufsvfs != (struct ufsvfs *)arg) return (0); fs = ip->i_fs; /* * Inode reconciliation fails: we made the filesystem quiescent * and we did a ufs_flush() before calling ufs_reconcile_inode() * and thus the inode should not have been changed inbetween. * Any discrepancies indicate a logic error and a pretty * significant run-state inconsistency we should complain about. */ if (ip->i_flag & (IMOD|IMODACC|IACC|IUPD|ICHG|IATTCHG)) { cmn_err(CE_WARN, "%s: Inode reconciliation failed for" "inode %llu", fs->fs_fsmnt, (u_longlong_t)ip->i_number); return (EINVAL); } /* * get the dinode */ bp = UFS_BREAD(ip->i_ufsvfs, ip->i_dev, (daddr_t)fsbtodb(fs, itod(fs, ip->i_number)), (int)fs->fs_bsize); if (bp->b_flags & B_ERROR) { brelse(bp); return (EIO); } dp = bp->b_un.b_dino; dp += itoo(fs, ip->i_number); /* * handle Sun's implementation of EFT */ d_uid = (dp->di_suid == UID_LONG) ? dp->di_uid : (uid_t)dp->di_suid; d_gid = (dp->di_sgid == GID_LONG) ? 
dp->di_gid : (uid_t)dp->di_sgid; rw_enter(&ip->i_contents, RW_WRITER); /* * some fields are not allowed to change */ if ((ip->i_mode != dp->di_mode) || (ip->i_gen != dp->di_gen) || (ip->i_uid != d_uid) || (ip->i_gid != d_gid)) { error = EACCES; goto out; } /* * and some are allowed to change */ ip->i_size = dp->di_size; ip->i_ic.ic_flags = dp->di_ic.ic_flags; ip->i_blocks = dp->di_blocks; ip->i_nlink = dp->di_nlink; if (ip->i_flag & IFASTSYMLNK) { ndaddr = 1; niaddr = 0; } else { ndaddr = NDADDR; niaddr = NIADDR; } for (i = 0; i < ndaddr; ++i) ip->i_db[i] = dp->di_db[i]; for (i = 0; i < niaddr; ++i) ip->i_ib[i] = dp->di_ib[i]; out: rw_exit(&ip->i_contents); brelse(bp); return (error); } /* * ufs_reconcile * reconcile ondisk superblock/inodes with any incore */ static int ufs_reconcile(struct vfs *vfsp, struct ufsvfs *ufsvfsp, int errlck) { int error = 0; /* * get rid of as much inmemory data as possible */ (void) ufs_flush(vfsp); /* * reconcile the superblock and inodes */ if (error = ufs_reconcile_fs(vfsp, ufsvfsp, errlck)) return (error); if (error = ufs_scan_inodes(0, ufs_reconcile_inode, ufsvfsp, ufsvfsp)) return (error); /* * allocation blocks may be incorrect; get rid of them */ (void) ufs_flush(vfsp); return (error); } /* * File system locking */ int ufs_fiolfs(struct vnode *vp, struct lockfs *lockfsp, int from_log) { return (ufs__fiolfs(vp, lockfsp, /* from_user */ 1, from_log)); } /* kernel-internal interface, also used by fix-on-panic */ int ufs__fiolfs( struct vnode *vp, struct lockfs *lockfsp, int from_user, int from_log) { struct ulockfs *ulp; struct lockfs lfs; int error; struct vfs *vfsp; struct ufsvfs *ufsvfsp; int errlck = NO_ERRLCK; int poll_events = POLLPRI; extern struct pollhead ufs_pollhd; ulockfs_info_t *head; ulockfs_info_t *info; int signal = 0; /* check valid lock type */ if (!lockfsp || lockfsp->lf_lock > LOCKFS_MAXLOCK) return (EINVAL); if (!vp || !vp->v_vfsp || !vp->v_vfsp->vfs_data) return (EIO); vfsp = vp->v_vfsp; ufsvfsp = (struct ufsvfs *)vfsp->vfs_data; ulp = &ufsvfsp->vfs_ulockfs; head = (ulockfs_info_t *)tsd_get(ufs_lockfs_key); SEARCH_ULOCKFSP(head, ulp, info); /* * Suspend both the reclaim thread and the delete thread. * This must be done outside the lockfs locking protocol. */ ufs_thread_suspend(&ufsvfsp->vfs_reclaim); ufs_thread_suspend(&ufsvfsp->vfs_delete); /* * Acquire vfs_reflock around ul_lock to avoid deadlock with * umount/remount/sync. */ vfs_lock_wait(vfsp); mutex_enter(&ulp->ul_lock); atomic_add_long(&ufs_quiesce_pend, 1); /* * Quit if there is another lockfs request in progress * that is waiting for existing ufs_vnops to complete. */ if (ULOCKFS_IS_BUSY(ulp)) { error = EBUSY; goto errexit; } /* cannot ulocked or downgrade a hard-lock */ if (ULOCKFS_IS_HLOCK(ulp)) { error = EIO; goto errexit; } /* an error lock may be unlocked or relocked, only */ if (ULOCKFS_IS_ELOCK(ulp)) { if (!LOCKFS_IS_ULOCK(lockfsp) && !LOCKFS_IS_ELOCK(lockfsp)) { error = EBUSY; goto errexit; } } /* * a read-only error lock may only be upgraded to an * error lock or hard lock */ if (ULOCKFS_IS_ROELOCK(ulp)) { if (!LOCKFS_IS_HLOCK(lockfsp) && !LOCKFS_IS_ELOCK(lockfsp)) { error = EBUSY; goto errexit; } } /* * until read-only error locks are fully implemented * just return EINVAL */ if (LOCKFS_IS_ROELOCK(lockfsp)) { error = EINVAL; goto errexit; } /* * an error lock may only be applied if the file system is * unlocked or already error locked. 
* (this is to prevent the case where a fs gets changed out from * underneath a fs that is locked for backup, * that is, name/delete/write-locked.) */ if ((!ULOCKFS_IS_ULOCK(ulp) && !ULOCKFS_IS_ELOCK(ulp) && !ULOCKFS_IS_ROELOCK(ulp)) && (LOCKFS_IS_ELOCK(lockfsp) || LOCKFS_IS_ROELOCK(lockfsp))) { error = EBUSY; goto errexit; } /* get and validate the input lockfs request */ if (error = ufs_getlfd(lockfsp, &ulp->ul_lockfs)) goto errexit; /* * save current ulockfs struct */ bcopy(&ulp->ul_lockfs, &lfs, sizeof (struct lockfs)); /* * Freeze the file system (pend future accesses) */ ufs_freeze(ulp, lockfsp); /* * Set locking in progress because ufs_quiesce may free the * ul_lock mutex. */ ULOCKFS_SET_BUSY(ulp); /* update the ioctl copy */ LOCKFS_SET_BUSY(&ulp->ul_lockfs); /* * We need to unset FWLOCK status before we call ufs_quiesce * so that the thread doesnt get suspended. We do this only if * this (fallocate) thread requested an unlock operation. */ if (info && (info->flags & ULOCK_INFO_FALLOCATE)) { if (!ULOCKFS_IS_WLOCK(ulp)) ULOCKFS_CLR_FWLOCK(ulp); } /* * Quiesce (wait for outstanding accesses to finish) */ if (error = ufs_quiesce(ulp)) { /* * Interrupted due to signal. There could still be * pending vnops. */ signal = 1; /* * We do broadcast because lock-status * could be reverted to old status. */ cv_broadcast(&ulp->ul_cv); goto errout; } /* * If the fallocate thread requested a write fs lock operation * then we set fwlock status in the ulp. */ if (info && (info->flags & ULOCK_INFO_FALLOCATE)) { if (ULOCKFS_IS_WLOCK(ulp)) ULOCKFS_SET_FWLOCK(ulp); } /* * can't wlock or (ro)elock fs with accounting or local swap file */ if ((ULOCKFS_IS_WLOCK(ulp) || ULOCKFS_IS_ELOCK(ulp) || ULOCKFS_IS_ROELOCK(ulp)) && !from_log) { if (error = ufs_checkaccton(vp)) goto errout; if (error = ufs_checkswapon(vp)) goto errout; } /* * save error lock status to pass down to reconcilation * routines and for later cleanup */ if (LOCKFS_IS_ELOCK(&lfs) && ULOCKFS_IS_ULOCK(ulp)) errlck = UN_ERRLCK; if (ULOCKFS_IS_ELOCK(ulp) || ULOCKFS_IS_ROELOCK(ulp)) { int needs_unlock; int needs_sbwrite; poll_events |= POLLERR; errlck = LOCKFS_IS_ELOCK(&lfs) || LOCKFS_IS_ROELOCK(&lfs) ? RE_ERRLCK : SET_ERRLCK; needs_unlock = !MUTEX_HELD(&ufsvfsp->vfs_lock); if (needs_unlock) mutex_enter(&ufsvfsp->vfs_lock); /* disable delayed i/o */ needs_sbwrite = 0; if (errlck == SET_ERRLCK) { ufsvfsp->vfs_fs->fs_clean = FSBAD; needs_sbwrite = 1; } needs_sbwrite |= ufsvfsp->vfs_dio; ufsvfsp->vfs_dio = 0; if (needs_unlock) mutex_exit(&ufsvfsp->vfs_lock); if (needs_sbwrite) { ulp->ul_sbowner = curthread; TRANS_SBWRITE(ufsvfsp, TOP_SBWRITE_STABLE); if (needs_unlock) mutex_enter(&ufsvfsp->vfs_lock); ufsvfsp->vfs_fs->fs_fmod = 0; if (needs_unlock) mutex_exit(&ufsvfsp->vfs_lock); } } /* * reconcile superblock and inodes if was wlocked */ if (LOCKFS_IS_WLOCK(&lfs) || LOCKFS_IS_ELOCK(&lfs)) { if (error = ufs_reconcile(vfsp, ufsvfsp, errlck)) goto errout; /* * in case the fs grew; reset the metadata map for logging tests */ TRANS_MATA_UMOUNT(ufsvfsp); TRANS_MATA_MOUNT(ufsvfsp); TRANS_MATA_SI(ufsvfsp, ufsvfsp->vfs_fs); } /* * At least everything *currently* dirty goes out. 
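 *	A flush failure is ignored for hard locks and error locks, which
 *	must be allowed to proceed even when the data can no longer be
 *	written back to the device.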
*/ if ((error = ufs_flush(vfsp)) != 0 && !ULOCKFS_IS_HLOCK(ulp) && !ULOCKFS_IS_ELOCK(ulp)) goto errout; /* * thaw file system and wakeup pended processes */ if (error = ufs_thaw(vfsp, ufsvfsp, ulp)) if (!ULOCKFS_IS_HLOCK(ulp) && !ULOCKFS_IS_ELOCK(ulp)) goto errout; /* * reset modified flag if not already write locked */ if (!LOCKFS_IS_WLOCK(&lfs)) ULOCKFS_CLR_MOD(ulp); /* * idle the lock struct */ ULOCKFS_CLR_BUSY(ulp); /* update the ioctl copy */ LOCKFS_CLR_BUSY(&ulp->ul_lockfs); /* * free current comment */ if (lfs.lf_comment && lfs.lf_comlen != 0) { kmem_free(lfs.lf_comment, lfs.lf_comlen); lfs.lf_comment = NULL; lfs.lf_comlen = 0; } /* do error lock cleanup */ if (errlck == UN_ERRLCK) ufsfx_unlockfs(ufsvfsp); else if (errlck == RE_ERRLCK) ufsfx_lockfs(ufsvfsp); /* don't allow error lock from user to invoke panic */ else if (from_user && errlck == SET_ERRLCK && !(ufsvfsp->vfs_fsfx.fx_flags & (UFSMNT_ONERROR_PANIC >> 4))) (void) ufs_fault(ufsvfsp->vfs_root, ulp->ul_lockfs.lf_comment && ulp->ul_lockfs.lf_comlen > 0 ? ulp->ul_lockfs.lf_comment: "user-applied error lock"); atomic_add_long(&ufs_quiesce_pend, -1); mutex_exit(&ulp->ul_lock); vfs_unlock(vfsp); if (ULOCKFS_IS_HLOCK(&ufsvfsp->vfs_ulockfs)) poll_events |= POLLERR; pollwakeup(&ufs_pollhd, poll_events); /* * Allow both the delete thread and the reclaim thread to * continue. */ ufs_thread_continue(&ufsvfsp->vfs_delete); ufs_thread_continue(&ufsvfsp->vfs_reclaim); return (0); errout: /* * Lock failed. Reset the old lock in ufsvfs if not hard locked. */ if (!LOCKFS_IS_HLOCK(&ulp->ul_lockfs)) { bcopy(&lfs, &ulp->ul_lockfs, sizeof (struct lockfs)); ulp->ul_fs_lock = (1 << lfs.lf_lock); } /* * Don't call ufs_thaw() when there's a signal during * ufs quiesce operation as it can lead to deadlock * with getpage. */ if (signal == 0) (void) ufs_thaw(vfsp, ufsvfsp, ulp); ULOCKFS_CLR_BUSY(ulp); LOCKFS_CLR_BUSY(&ulp->ul_lockfs); errexit: atomic_add_long(&ufs_quiesce_pend, -1); mutex_exit(&ulp->ul_lock); vfs_unlock(vfsp); /* * Allow both the delete thread and the reclaim thread to * continue. */ ufs_thread_continue(&ufsvfsp->vfs_delete); ufs_thread_continue(&ufsvfsp->vfs_reclaim); return (error); } /* * fiolfss * return the current file system locking state info */ int ufs_fiolfss(struct vnode *vp, struct lockfs *lockfsp) { struct ulockfs *ulp; if (!vp || !vp->v_vfsp || !VTOI(vp)) return (EINVAL); /* file system has been forcibly unmounted */ if (VTOI(vp)->i_ufsvfs == NULL) return (EIO); ulp = VTOUL(vp); if (ULOCKFS_IS_HLOCK(ulp)) { *lockfsp = ulp->ul_lockfs; /* structure assignment */ return (0); } mutex_enter(&ulp->ul_lock); *lockfsp = ulp->ul_lockfs; /* structure assignment */ if (ULOCKFS_IS_MOD(ulp)) lockfsp->lf_flags |= LOCKFS_MOD; mutex_exit(&ulp->ul_lock); return (0); } /* * ufs_check_lockfs * check whether a ufs_vnops conflicts with the file system lock */ int ufs_check_lockfs(struct ufsvfs *ufsvfsp, struct ulockfs *ulp, ulong_t mask) { k_sigset_t smask; int sig, slock; ASSERT(MUTEX_HELD(&ulp->ul_lock)); while (ulp->ul_fs_lock & mask) { slock = (int)ULOCKFS_IS_SLOCK(ulp); if ((curthread->t_flag & T_DONTPEND) && !slock) { curthread->t_flag |= T_WOULDBLOCK; return (EAGAIN); } curthread->t_flag &= ~T_WOULDBLOCK; /* * In the case of an onerr umount of the fs, threads could * have blocked before coming into ufs_check_lockfs and * need to check for the special case of ELOCK and * vfs_dontblock being set which would indicate that the fs * is on its way out and will not return therefore making * EIO the appropriate response. 
*/ if (ULOCKFS_IS_HLOCK(ulp) || (ULOCKFS_IS_ELOCK(ulp) && ufsvfsp->vfs_dontblock)) return (EIO); /* * wait for lock status to change */ if (slock || ufsvfsp->vfs_nointr) { cv_wait(&ulp->ul_cv, &ulp->ul_lock); } else { sigintr(&smask, 1); sig = cv_wait_sig(&ulp->ul_cv, &ulp->ul_lock); sigunintr(&smask); if ((!sig && (ulp->ul_fs_lock & mask)) || ufsvfsp->vfs_dontblock) return (EINTR); } } if (mask & ULOCKFS_FWLOCK) { atomic_add_long(&ulp->ul_falloc_cnt, 1); ULOCKFS_SET_FALLOC(ulp); } else { atomic_add_long(&ulp->ul_vnops_cnt, 1); } return (0); } /* * Check whether we came across the handcrafted lockfs protocol path. We can't * simply check for T_DONTBLOCK here as one would assume since this can also * falsely catch recursive VOP's going to a different filesystem, instead we * check if we already hold the ulockfs->ul_lock mutex. */ static int ufs_lockfs_is_under_rawlockfs(struct ulockfs *ulp) { return ((mutex_owner(&ulp->ul_lock) != curthread) ? 0 : 1); } /* * ufs_lockfs_begin - start the lockfs locking protocol */ int ufs_lockfs_begin(struct ufsvfs *ufsvfsp, struct ulockfs **ulpp, ulong_t mask) { int error; int rec_vop; struct ulockfs *ulp; ulockfs_info_t *ulockfs_info; ulockfs_info_t *ulockfs_info_free; ulockfs_info_t *ulockfs_info_temp; /* * file system has been forcibly unmounted */ if (ufsvfsp == NULL) return (EIO); *ulpp = ulp = &ufsvfsp->vfs_ulockfs; /* * Do lockfs protocol */ ulockfs_info = (ulockfs_info_t *)tsd_get(ufs_lockfs_key); IS_REC_VOP(rec_vop, ulockfs_info, ulp, ulockfs_info_free); /* * Detect recursive VOP call or handcrafted internal lockfs protocol * path and bail out in that case. */ if (rec_vop || ufs_lockfs_is_under_rawlockfs(ulp)) { *ulpp = NULL; return (0); } else { if (ulockfs_info_free == NULL) { if ((ulockfs_info_temp = (ulockfs_info_t *) kmem_zalloc(sizeof (ulockfs_info_t), KM_NOSLEEP)) == NULL) { *ulpp = NULL; return (ENOMEM); } } } /* * First time VOP call */ mutex_enter(&ulp->ul_lock); if (ULOCKFS_IS_JUSTULOCK(ulp)) { if (mask & ULOCKFS_FWLOCK) { atomic_add_long(&ulp->ul_falloc_cnt, 1); ULOCKFS_SET_FALLOC(ulp); } else { atomic_add_long(&ulp->ul_vnops_cnt, 1); } } else { if (error = ufs_check_lockfs(ufsvfsp, ulp, mask)) { mutex_exit(&ulp->ul_lock); if (ulockfs_info_free == NULL) kmem_free(ulockfs_info_temp, sizeof (ulockfs_info_t)); return (error); } } mutex_exit(&ulp->ul_lock); if (ulockfs_info_free != NULL) { ulockfs_info_free->ulp = ulp; if (mask & ULOCKFS_FWLOCK) ulockfs_info_free->flags |= ULOCK_INFO_FALLOCATE; } else { ulockfs_info_temp->ulp = ulp; ulockfs_info_temp->next = ulockfs_info; if (mask & ULOCKFS_FWLOCK) ulockfs_info_temp->flags |= ULOCK_INFO_FALLOCATE; ASSERT(ufs_lockfs_key != 0); (void) tsd_set(ufs_lockfs_key, (void *)ulockfs_info_temp); } curthread->t_flag |= T_DONTBLOCK; return (0); } /* * Check whether we are returning from the top level VOP. */ static int ufs_lockfs_top_vop_return(ulockfs_info_t *head) { ulockfs_info_t *info; int result = 1; for (info = head; info != NULL; info = info->next) { if (info->ulp != NULL) { result = 0; break; } } return (result); } /* * ufs_lockfs_end - terminate the lockfs locking protocol */ void ufs_lockfs_end(struct ulockfs *ulp) { ulockfs_info_t *info; ulockfs_info_t *head; /* * end-of-VOP protocol */ if (ulp == NULL) return; head = (ulockfs_info_t *)tsd_get(ufs_lockfs_key); SEARCH_ULOCKFSP(head, ulp, info); /* * If we're called from a first level VOP, we have to have a * valid ulockfs record in the TSD. */ ASSERT(info != NULL); /* * Invalidate the ulockfs record. 
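 *	The slot stays on the TSD list so a later first-level VOP on
 *	this thread can reuse it (see the free-slot search in
 *	IS_REC_VOP()).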
*/ info->ulp = NULL; if (ufs_lockfs_top_vop_return(head)) curthread->t_flag &= ~T_DONTBLOCK; mutex_enter(&ulp->ul_lock); /* fallocate thread */ if (ULOCKFS_IS_FALLOC(ulp) && info->flags & ULOCK_INFO_FALLOCATE) { if (!atomic_add_long_nv(&ulp->ul_falloc_cnt, -1)) ULOCKFS_CLR_FALLOC(ulp); } else { /* normal thread */ if (!atomic_add_long_nv(&ulp->ul_vnops_cnt, -1)) cv_broadcast(&ulp->ul_cv); } /* Clear the thread's fallocate state */ if (info->flags & ULOCK_INFO_FALLOCATE) info->flags &= ~ULOCK_INFO_FALLOCATE; if (ulp->ul_vnops_cnt == 0 && ulp->ul_falloc_cnt) cv_broadcast(&ulp->ul_cv); mutex_exit(&ulp->ul_lock); } /* * specialized version of ufs_lockfs_begin() called by ufs_getpage(). */ int ufs_lockfs_begin_getpage( struct ufsvfs *ufsvfsp, struct ulockfs **ulpp, struct seg *seg, int read_access, uint_t *protp) { ulong_t mask; int error; int rec_vop; struct ulockfs *ulp; ulockfs_info_t *ulockfs_info; ulockfs_info_t *ulockfs_info_free; ulockfs_info_t *ulockfs_info_temp; /* * file system has been forcibly unmounted */ if (ufsvfsp == NULL) return (EIO); *ulpp = ulp = &ufsvfsp->vfs_ulockfs; /* * Do lockfs protocol */ ulockfs_info = (ulockfs_info_t *)tsd_get(ufs_lockfs_key); IS_REC_VOP(rec_vop, ulockfs_info, ulp, ulockfs_info_free); /* * Detect recursive VOP call or handcrafted internal lockfs protocol * path and bail out in that case. */ if (rec_vop || ufs_lockfs_is_under_rawlockfs(ulp)) { *ulpp = NULL; return (0); } else { if (ulockfs_info_free == NULL) { if ((ulockfs_info_temp = (ulockfs_info_t *) kmem_zalloc(sizeof (ulockfs_info_t), KM_NOSLEEP)) == NULL) { *ulpp = NULL; return (ENOMEM); } } } /* * First time VOP call */ mutex_enter(&ulp->ul_lock); if (ULOCKFS_IS_JUSTULOCK(ulp)) /* * fs is not locked, simply inc the active-ops counter */ atomic_add_long(&ulp->ul_vnops_cnt, 1); else { if (seg->s_ops == &segvn_ops && ((struct segvn_data *)seg->s_data)->type != MAP_SHARED) { mask = (ulong_t)ULOCKFS_GETREAD_MASK; } else if (protp && read_access) { /* * Restrict the mapping to readonly. * Writes to this mapping will cause * another fault which will then * be suspended if fs is write locked */ *protp &= ~PROT_WRITE; mask = (ulong_t)ULOCKFS_GETREAD_MASK; } else mask = (ulong_t)ULOCKFS_GETWRITE_MASK; /* * will sleep if this fs is locked against this VOP */ if (error = ufs_check_lockfs(ufsvfsp, ulp, mask)) { mutex_exit(&ulp->ul_lock); if (ulockfs_info_free == NULL) kmem_free(ulockfs_info_temp, sizeof (ulockfs_info_t)); return (error); } } mutex_exit(&ulp->ul_lock); if (ulockfs_info_free != NULL) { ulockfs_info_free->ulp = ulp; } else { ulockfs_info_temp->ulp = ulp; ulockfs_info_temp->next = ulockfs_info; ASSERT(ufs_lockfs_key != 0); (void) tsd_set(ufs_lockfs_key, (void *)ulockfs_info_temp); } curthread->t_flag |= T_DONTBLOCK; return (0); } void ufs_lockfs_tsd_destructor(void *head) { ulockfs_info_t *curr = (ulockfs_info_t *)head; ulockfs_info_t *temp; for (; curr != NULL; ) { /* * The TSD destructor is being called when the thread exits * (via thread_exit()). At that time it must have cleaned up * all VOPs via ufs_lockfs_end() and there must not be a * valid ulockfs record exist while a thread is exiting. */ temp = curr; curr = curr->next; ASSERT(temp->ulp == NULL); kmem_free(temp, sizeof (ulockfs_info_t)); } }
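
/*
 * Typical use of the lockfs protocol by a vnode operation.  This is an
 * illustrative sketch only; ULOCKFS_WRITE_MASK stands for whichever
 * per-operation mask applies, and the surrounding logic varies per
 * operation (see the callers in ufs_vnops.c):
 *
 *	struct ulockfs *ulp;
 *	int error;
 *
 *	error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_WRITE_MASK);
 *	if (error)
 *		return (error);
 *
 *	(do the work of the operation; a NULL ulp means this was a
 *	recursive VOP and no begin/end accounting was done)
 *
 *	if (ulp)
 *		ufs_lockfs_end(ulp);
 */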