/* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License, Version 1.0 only * (the "License"). You may not use this file except in compliance * with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2005 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ /* All Rights Reserved */ /* * University Copyright- Copyright (c) 1982, 1986, 1988 * The Regents of the University of California * All Rights Reserved * * University Acknowledgment- Portions of this document are derived from * software developed by the University of California, Berkeley, and its * contributors. */ #pragma ident "%Z%%M% %I% %E% SMI" /* * Directory manipulation routines. * * When manipulating directories, the i_rwlock provides serialization * since directories cannot be mmapped. The i_contents lock is redundant. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * This is required since we're using P2ROUNDUP_TYPED on DIRBLKSIZ */ #if !ISP2(DIRBLKSIZ) #error "DIRBLKSIZ not a power of 2" #endif /* * A virgin directory. 
*/
static struct dirtemplate mastertemplate = {
	0, 12, 1, ".",
	0, DIRBLKSIZ - 12, 2, ".."
};

/*
 * LDIRSIZ(len) is sizeof (struct direct) less (MAXNAMLEN + 1), plus
 * (len + 1) rounded up to a multiple of 4.
 * MAX_DIR_NAME_LEN(len) subtracts that same fixed part, and one more byte,
 * from len.
 */
#define LDIRSIZ(len) \
    ((sizeof (struct direct) - (MAXNAMLEN + 1)) + ((len + 1 + 3) &~ 3))

#define MAX_DIR_NAME_LEN(len) \
    (((len) - (sizeof (struct direct) - (MAXNAMLEN + 1))) - 1)

/*
 * The dnlc directory cache allows a 64 bit handle for directory entries.
 * For ufs we squeeze both the 32 bit inumber and a 32 bit disk offset
 * into the handle. Note, a 32 bit offset allows a 4GB directory, which
 * is way beyond what could be cached in memory by the directory
 * caching routines. So we are quite safe with this limit.
 * The macros below pack and unpack the handle: the inumber lives in the
 * low 32 bits and the offset in the high 32 bits.
 */
#define H_TO_INO(h) (uint32_t)((h) & UINT_MAX)
#define H_TO_OFF(h) (off_t)((h) >> 32)
#define INO_OFF_TO_H(ino, off) (uint64_t)(((uint64_t)(off) << 32) | (ino))

/*
 * The average size of a typical on-disk directory entry is about 16 bytes,
 * which is what AV_DIRECT_SHIFT (log2(16)) encodes.
 * This define is only used to approximate the number of entries
 * in a directory. This is needed for dnlc_dir_start() which will immediately
 * return an error if the value is not within its acceptable range of
 * number of files in a directory.
 */
#define AV_DIRECT_SHIFT 4

/*
 * If the directory size (from i_size) is at least the ufs_min_dir_cache
 * tunable then we request dnlc directory caching.
 * This has been found to be profitable after 1024 file names.
*/
int ufs_min_dir_cache = 1024 << AV_DIRECT_SHIFT;

/* The time point the dnlc directory caching was disabled */
static hrtime_t ufs_dc_disable_at;
/* directory caching disable duration */
static hrtime_t ufs_dc_disable_duration = (hrtime_t)NANOSEC * 5;

#ifdef DEBUG
int dirchk = 1;
#else /* !DEBUG */
int dirchk = 0;
#endif /* DEBUG */
int ufs_negative_cache = 1;
uint64_t ufs_dirremove_retry_cnt;

static void dirbad();
static int ufs_dirrename();
static int ufs_diraddentry();
static int ufs_dirempty();
static int ufs_dirscan();
static int ufs_dirclrdotdot();
static int ufs_dirfixdotdot();
static int ufs_dirpurgedotdot();
static int dirprepareentry();
static int ufs_dirmakedirect();
static int dirbadname();
static int dirmangled();

/*
 * Look for a given name in a directory.  On successful return, *ipp
 * will point to the VN_HELD inode.
 *
 * dp is the directory to search, namep the (non-empty) component name and
 * cr the credentials used for the IEXEC access check.  A non-zero skipdnlc
 * bypasses the per-name dnlc_lookup() fast path.
 *
 * Returns 0 on success, ENOTDIR if dp is neither IFDIR nor IFATTRDIR,
 * ENOENT if the name is not present, or the error returned by
 * ufs_iaccess(), ufs_iget_alloced() or blkatoff().
 *
 * dp->i_rwlock is taken as reader in here and is dropped again before
 * every return.
 */
int
ufs_dirlook(
	struct inode *dp,
	char *namep,
	struct inode **ipp,
	struct cred *cr,
	int skipdnlc)			/* skip the 1st level dnlc */
{
	uint64_t handle;
	struct fbuf *fbp;		/* a buffer of directory entries */
	struct direct *ep;		/* the current directory entry */
	struct vnode *vp;
	struct vnode *dvp;		/* directory vnode ptr */
	dcanchor_t *dcap;
	off_t endsearch;		/* offset to end directory search */
	off_t offset;
	off_t start_off;		/* starting offset from middle search */
	off_t last_offset;		/* last offset */
	int entryoffsetinblock;		/* offset of ep in addr's buffer */
	int numdirpasses;		/* strategy for directory search */
	int namlen;			/* length of name */
	int err;
	int doingchk;
	int i;
	int caching;
	ino_t ep_ino;			/* entry i number */
	ino_t chkino;
	ushort_t ep_reclen;		/* direct local d_reclen */

	ASSERT(*namep != '\0'); /* All callers ensure *namep is non null */

	/*
	 * Check accessibility of directory.
	 */
	if (((dp->i_mode & IFMT) != IFDIR) &&
	    ((dp->i_mode & IFMT) != IFATTRDIR))
		return (ENOTDIR);

	if (err = ufs_iaccess(dp, IEXEC, cr))
		return (err);

	/*
	 * Check the directory name lookup cache, first for individual files
	 * then for complete directories.
	 */
	dvp = ITOV(dp);
	if (!skipdnlc && (vp = dnlc_lookup(dvp, namep))) {
		/* vp is already held from dnlc_lookup */
		if (vp == DNLC_NO_VNODE) {
			/* negative cache hit: drop the hold and report ENOENT */
			VN_RELE(vp);
			return (ENOENT);
		}
		*ipp = VTOI(vp);
		return (0);
	}

	dcap = &dp->i_danchor;

	/*
	 * Grab the reader lock on the directory data before checking
	 * the dnlc to avoid a race with ufs_dirremove() & friends.
	 */
	rw_enter(&dp->i_rwlock, RW_READER);

	switch (dnlc_dir_lookup(dcap, namep, &handle)) {
	case DFOUND:
		ep_ino = (ino_t)H_TO_INO(handle);
		if (dp->i_number == ep_ino) {
			VN_HOLD(dvp);	/* want ourself, "." */
			*ipp = dp;
			rw_exit(&dp->i_rwlock);
			return (0);
		}
		if (namep[0] == '.' && namep[1] == '.' && namep[2] == 0) {
			uint64_t handle2;
			/*
			 * release the lock on the dir we are searching
			 * to avoid a deadlock when grabbing the
			 * i_contents lock in ufs_iget_alloced().
			 */
			rw_exit(&dp->i_rwlock);
			rw_enter(&dp->i_ufsvfs->vfs_dqrwlock, RW_READER);
			err = ufs_iget_alloced(dp->i_vfs, ep_ino, ipp, cr);
			rw_exit(&dp->i_ufsvfs->vfs_dqrwlock);
			/*
			 * must recheck as we dropped dp->i_rwlock
			 */
			rw_enter(&dp->i_rwlock, RW_READER);
			if (!err && (dnlc_dir_lookup(dcap, namep, &handle2)
			    == DFOUND) && (handle == handle2)) {
				dnlc_update(dvp, namep, ITOV(*ipp));
				rw_exit(&dp->i_rwlock);
				return (0);
			}
			/* check failed, read the actual directory */
			if (!err) {
				VN_RELE(ITOV(*ipp));
			}
			/* i_rwlock is still held as reader at restart */
			goto restart;
		}
		/* usual case of not "." nor ".." */
		rw_enter(&dp->i_ufsvfs->vfs_dqrwlock, RW_READER);
		err = ufs_iget_alloced(dp->i_vfs, ep_ino, ipp, cr);
		rw_exit(&dp->i_ufsvfs->vfs_dqrwlock);
		if (err) {
			rw_exit(&dp->i_rwlock);
			return (err);
		}
		dnlc_update(dvp, namep, ITOV(*ipp));
		rw_exit(&dp->i_rwlock);
		return (0);
	case DNOENT:
		if (ufs_negative_cache && (dp->i_nlink > 0)) {
			dnlc_enter(dvp, namep, DNLC_NO_VNODE);
		}
		rw_exit(&dp->i_rwlock);
		return (ENOENT);
	default:
		break;
	}
restart:

	fbp = NULL;
	doingchk = 0;
	chkino = 0;
	caching = 0;

	/*
	 * Attempt to cache any directory whose size is at least the
	 * ufs_min_dir_cache tunable. If it fails due to memory shortage
	 * (DNOMEM), disable caching for this directory and record the
	 * system time. Any attempt after the disable time has expired
	 * will enable the caching again.
	 */
	if (dp->i_size >= ufs_min_dir_cache) {
		/*
		 * if the directory caching disable time has expired
		 * enable the caching again.
		 */
		if (dp->i_cachedir == CD_DISABLED_NOMEM &&
		    gethrtime() - ufs_dc_disable_at > ufs_dc_disable_duration) {
			ufs_dc_disable_at = 0;
			dp->i_cachedir = CD_ENABLED;
		}
		if (dp->i_cachedir == CD_ENABLED) {
			switch (dnlc_dir_start(dcap, dp->i_size >>
			    AV_DIRECT_SHIFT)) {
			case DNOMEM:
				dp->i_cachedir = CD_DISABLED_NOMEM;
				ufs_dc_disable_at = gethrtime();
				break;
			case DTOOBIG:
				dp->i_cachedir = CD_DISABLED_TOOBIG;
				break;
			case DOK:
				caching = 1;
				break;
			default:
				break;
			}
		}
	}
	/*
	 * If caching we don't stop when the file has been
	 * found, but need to know later, so clear *ipp now
	 */
	*ipp = NULL;

recheck:
	if (caching) {
		/* building the cache requires a single pass from offset 0 */
		offset = 0;
		entryoffsetinblock = 0;
		numdirpasses = 1;
	} else {
		/*
		 * Take care to look at dp->i_diroff only once, as it
		 * may be changing due to other threads/cpus.
		 */
		offset = dp->i_diroff;
		if (offset > dp->i_size) {
			offset = 0;
		}
		if (offset == 0) {
			entryoffsetinblock = 0;
			numdirpasses = 1;
		} else {
			/*
			 * Start at the remembered offset; a second pass
			 * covers [0, start_off) if the first one fails.
			 */
			start_off = offset;

			entryoffsetinblock = blkoff(dp->i_fs, offset);
			if (entryoffsetinblock != 0) {
				err = blkatoff(dp, offset, (char **)0, &fbp);
				if (err)
					goto bad;
			}
			numdirpasses = 2;
		}
	}
	endsearch = P2ROUNDUP_TYPED(dp->i_size, DIRBLKSIZ, u_offset_t);
	namlen = strlen(namep);
	last_offset = 0;

searchloop:
	while (offset < endsearch) {
		/*
		 * If offset is on a block boundary,
		 * read the next directory block.
		 * Release previous if it exists.
		 */
		if (blkoff(dp->i_fs, offset) == 0) {
			if (fbp != NULL) {
				fbrelse(fbp, S_OTHER);
			}
			err = blkatoff(dp, offset, (char **)0, &fbp);
			if (err)
				goto bad;
			entryoffsetinblock = 0;
		}

		/*
		 * If the offset to the next entry is invalid or if the
		 * next entry is a zero length record or if the record
		 * length is invalid, then skip to the next directory
		 * block.  Complete validation checks are done if the
		 * record length is invalid.
		 *
		 * Full validation checks are slow so they are disabled
		 * by default.  Complete checks can be run by patching
		 * "dirchk" to be true.
		 *
		 * We have to check the validity of entryoffsetinblock
		 * here because it can be set to i_diroff above.
		 */
		ep = (struct direct *)(fbp->fb_addr + entryoffsetinblock);
		if ((entryoffsetinblock & 0x3) || ep->d_reclen == 0 ||
		    (dirchk || (ep->d_reclen & 0x3)) &&
		    dirmangled(dp, ep, entryoffsetinblock, offset)) {
			/* advance to the start of the next DIRBLKSIZ chunk */
			i = DIRBLKSIZ - (entryoffsetinblock & (DIRBLKSIZ - 1));
			offset += i;
			entryoffsetinblock += i;
			if (caching) {
				dnlc_dir_purge(dcap);
				caching = 0;
			}
			continue;
		}

		ep_reclen = ep->d_reclen;

		/*
		 * Add named entries and free space into the directory cache
		 */
		if (caching) {
			ushort_t extra;
			off_t off2;

			if (ep->d_ino == 0) {
				extra = ep_reclen;
				if (offset & (DIRBLKSIZ - 1)) {
					/*
					 * an unused entry that is not at the
					 * start of a DIRBLKSIZ chunk: give up
					 * caching this directory
					 */
					dnlc_dir_purge(dcap);
					dp->i_cachedir = CD_DISABLED;
					caching = 0;
				}
			} else {
				/*
				 * entries hold the previous offset except the
				 * 1st which holds the offset + 1
				 */
				if (offset & (DIRBLKSIZ - 1)) {
					off2 = last_offset;
				} else {
					off2 = offset + 1;
				}
				caching = (dnlc_dir_add_entry(dcap, ep->d_name,
				    INO_OFF_TO_H(ep->d_ino, off2)) == DOK);
				extra = ep_reclen - DIRSIZ(ep);
			}
			if (caching && (extra >= LDIRSIZ(1))) {
				caching = (dnlc_dir_add_space(dcap, extra,
				    (uint64_t)offset) == DOK);
			}
		}

		/*
		 * Check for a name match.
		 * We have the parent inode read locked with i_rwlock.
		 */
		if (ep->d_ino && ep->d_namlen == namlen &&
		    *namep == *ep->d_name &&	/* fast chk 1st chr */
		    bcmp(namep, ep->d_name, (int)ep->d_namlen) == 0) {
			/*
			 * We have to release the fbp early here to avoid
			 * a possible deadlock situation where we have the
			 * fbp and want the directory inode and someone doing
			 * a ufs_direnter_* has the directory inode and wants
			 * the fbp.  XXX - is this still needed?
			 */
			ep_ino = (ino_t)ep->d_ino;
			ASSERT(fbp != NULL);
			fbrelse(fbp, S_OTHER);
			fbp = NULL;

			/*
			 * Atomic update (read lock held)
			 */
			dp->i_diroff = offset;

			if (namlen == 2 && namep[0] == '.' && namep[1] == '.') {
				struct timeval32 omtime;

				if (caching) {
					dnlc_dir_purge(dcap);
					caching = 0;
				}
				if (doingchk) {
					/*
					 * if the inumber didn't change
					 * continue with already found inode.
					 */
					if (ep_ino == chkino)
						goto checkok;
					else {
						VN_RELE(ITOV(*ipp));
						/* *ipp is nulled at restart */
						goto restart;
					}
				}
				/*
				 * release the lock on the dir we are searching
				 * to avoid a deadlock when grabbing the
				 * i_contents lock in ufs_iget_alloced().
				 */
				omtime = dp->i_mtime;
				rw_exit(&dp->i_rwlock);
				rw_enter(&dp->i_ufsvfs->vfs_dqrwlock,
				    RW_READER);
				err = ufs_iget_alloced(dp->i_vfs, ep_ino, ipp,
				    cr);
				rw_exit(&dp->i_ufsvfs->vfs_dqrwlock);
				rw_enter(&dp->i_rwlock, RW_READER);
				if (err)
					goto bad;
				/*
				 * Since we released the lock on the directory,
				 * we must check that the same inode is still
				 * the ".." entry for this directory.
				 */
				/*CSTYLED*/
				if (timercmp(&omtime, &dp->i_mtime, !=)) {
					/*
					 * Modification time changed on the
					 * directory, we must go check if
					 * the inumber changed for ".."
					 */
					doingchk = 1;
					chkino = ep_ino;
					entryoffsetinblock = 0;
					if (caching) {
						/*
						 * Forget directory caching
						 * for this rare case
						 */
						dnlc_dir_purge(dcap);
						caching = 0;
					}
					goto recheck;
				}
			} else if (dp->i_number == ep_ino) {
				VN_HOLD(dvp);	/* want ourself, "." */
				*ipp = dp;
				if (caching) {
					dnlc_dir_purge(dcap);
					caching = 0;
				}
			} else {
				rw_enter(&dp->i_ufsvfs->vfs_dqrwlock,
				    RW_READER);
				err = ufs_iget_alloced(dp->i_vfs, ep_ino, ipp,
				    cr);
				rw_exit(&dp->i_ufsvfs->vfs_dqrwlock);
				if (err)
					goto bad;
			}
checkok:
			ASSERT(*ipp);
			dnlc_update(dvp, namep, ITOV(*ipp));
			/*
			 * If we are not caching then just return the entry
			 * otherwise complete loading up the cache
			 */
			if (!caching) {
				rw_exit(&dp->i_rwlock);
				return (0);
			}
			/* fbp was released above; map the block again */
			err = blkatoff(dp, offset, (char **)0, &fbp);
			if (err)
				goto bad;
		}
		last_offset = offset;
		offset += ep_reclen;
		entryoffsetinblock += ep_reclen;
	}
	/*
	 * If we started in the middle of the directory and failed
	 * to find our target, we must check the beginning as well.
	 */
	if (numdirpasses == 2) {
		numdirpasses--;
		offset = 0;
		endsearch = start_off;
		goto searchloop;
	}

	/*
	 * If whole directory caching is on (or was originally on) then
	 * the entry may have been found.
	 */
	if (*ipp == NULL) {
		err = ENOENT;
		if (ufs_negative_cache && (dp->i_nlink > 0)) {
			dnlc_enter(dvp, namep, DNLC_NO_VNODE);
		}
	}
	if (caching) {
		dnlc_dir_complete(dcap);
		caching = 0;
	}

bad:
	if (err && *ipp) {
		/*
		 * err and *ipp can both be set if we were attempting to
		 * cache the directory, and we found the entry, then later
		 * while trying to complete the directory cache encountered
		 * an error (eg reading a directory sector).
		 */
		VN_RELE(ITOV(*ipp));
		*ipp = NULL;
	}

	if (fbp)
		fbrelse(fbp, S_OTHER);
	rw_exit(&dp->i_rwlock);
	/* an error part-way through a caching scan leaves caching set */
	if (caching)
		dnlc_dir_purge(dcap);
	return (err);
}

/*
 * Write a new directory entry for DE_CREATE or DE_MKDIR operations.
*/
/*
 * ufs_direnter_cm() is entered with tdp->i_rwlock held as writer (asserted
 * below).  For "." and ".." that lock is dropped around ufs_dirlook() and
 * re-taken as writer before returning.
 *
 * Returns 0 with *ipp set to the newly made inode; EEXIST with *ipp set to
 * the held existing inode when the name is already present; EINVAL, EACCES,
 * ENOENT or ENOTDIR from the checks below; or the error returned by
 * ufs_iaccess(), ufs_dirlook(), ufs_dircheckforname(), ufs_dirmakeinode()
 * or ufs_diraddentry().
 */
int
ufs_direnter_cm(
	struct inode *tdp,	/* target directory to make entry in */
	char *namep,		/* name of entry */
	enum de_op op,		/* entry operation */
	struct vattr *vap,	/* attributes if new inode needed */
	struct inode **ipp,	/* return entered inode here */
	struct cred *cr,	/* user credentials */
	int flags)		/* IQUIET bit; any other bit means "no entry" */
{
	struct inode *tip;	/* inode of (existing) target file */
	char *s;
	struct slot slot;	/* slot info to pass around */
	int namlen;		/* length of name */
	int err;		/* error number */
	struct inode *nip;	/* new inode */
	int do_rele_nip = 0;	/* release nip */
	int noentry = flags & ~IQUIET;
	int quiet = flags & IQUIET;	/* Suppress out of inodes message */

	ASSERT(RW_WRITE_HELD(&tdp->i_rwlock));

	/*
	 * In an attribute directory, refuse DE_MKDIR and refuse creating
	 * VCHR, VBLK, VDOOR, VSOCK or VFIFO nodes.
	 */
	if (((tdp->i_mode & IFMT) == IFATTRDIR) && ((op == DE_MKDIR) ||
	    ((vap->va_type == VCHR) || (vap->va_type == VBLK) ||
	    (vap->va_type == VDOOR) || (vap->va_type == VSOCK) ||
	    (vap->va_type == VFIFO))))
		return (EINVAL);

	/* don't allow '/' characters in pathname component */
	for (s = namep, namlen = 0; *s; s++, namlen++)
		if (*s == '/')
			return (EACCES);
	ASSERT(namlen);

	/*
	 * If name is "." or ".." then if this is a create look it up
	 * and return EEXIST.
	 */
	if (namep[0] == '.' &&
	    (namlen == 1 || (namlen == 2 && namep[1] == '.'))) {
		/*
		 * ufs_dirlook will acquire the i_rwlock
		 */
		rw_exit(&tdp->i_rwlock);
		if (err = ufs_dirlook(tdp, namep, ipp, cr, 0)) {
			rw_enter(&tdp->i_rwlock, RW_WRITER);
			return (err);
		}
		rw_enter(&tdp->i_rwlock, RW_WRITER);
		return (EEXIST);
	}

	/*
	 * If target directory has not been removed, then we can consider
	 * allowing file to be created.
	 */
	if (tdp->i_nlink <= 0) {
		return (ENOENT);
	}

	/*
	 * Check accessibility of directory.
	 */
	if (((tdp->i_mode & IFMT) != IFDIR) &&
	    ((tdp->i_mode & IFMT) != IFATTRDIR)) {
		return (ENOTDIR);
	}

	/*
	 * Execute access is required to search the directory.
	 */
	if (err = ufs_iaccess(tdp, IEXEC, cr)) {
		return (err);
	}

	/*
	 * Search for the entry. Return VN_HELD tip if found.
	 */
	tip = NULL;
	slot.fbp = NULL;
	slot.status = NONE;
	rw_enter(&tdp->i_ufsvfs->vfs_dqrwlock, RW_READER);
	rw_enter(&tdp->i_contents, RW_WRITER);
	err = ufs_dircheckforname(tdp, namep, namlen, &slot, &tip, cr, noentry);
	if (err)
		goto out;

	if (tip) {
		ASSERT(!noentry);
		*ipp = tip;
		err = EEXIST;
	} else {
		/*
		 * The entry does not exist. Check write permission in
		 * directory to see if entry can be created.
		 */
		if (err = ufs_iaccess(tdp, IWRITE, cr))
			goto out;
		/*
		 * Make new inode and directory entry.
		 */
		tdp->i_flag |= quiet;
		if (err = ufs_dirmakeinode(tdp, &nip, vap, op, cr)) {
			/* NOTE(review): relies on the callee setting nip */
			if (nip != NULL)
				do_rele_nip = 1;
			goto out;
		}
		if (err = ufs_diraddentry(tdp, namep, op,
		    namlen, &slot, nip, NULL, cr)) {
			/*
			 * Unmake the inode we just made.
			 */
			rw_enter(&nip->i_contents, RW_WRITER);
			if (((nip->i_mode & IFMT) == IFDIR) ||
			    ((nip->i_mode & IFMT) == IFATTRDIR)) {
				tdp->i_nlink--;
				ufs_setreclaim(tdp);
				tdp->i_flag |= ICHG;
				tdp->i_seq++;
				TRANS_INODE(tdp->i_ufsvfs, tdp);
				ITIMES_NOLOCK(tdp);
			}
			nip->i_nlink = 0;
			ufs_setreclaim(nip);
			TRANS_INODE(nip->i_ufsvfs, nip);
			nip->i_flag |= ICHG;
			nip->i_seq++;
			ITIMES_NOLOCK(nip);
			rw_exit(&nip->i_contents);
			do_rele_nip = 1;
		} else {
			*ipp = nip;
		}
	}

out:
	if (slot.fbp)
		fbrelse(slot.fbp, S_OTHER);

	/* clear the IQUIET bit that was or'ed into i_flag above */
	tdp->i_flag &= ~quiet;
	rw_exit(&tdp->i_contents);

	/*
	 * Drop vfs_dqrwlock before calling VN_RELE() on nip to
	 * avoid deadlock since ufs_delete() grabs vfs_dqrwlock as reader.
	 */
	rw_exit(&tdp->i_ufsvfs->vfs_dqrwlock);

	if (do_rele_nip) {
		VN_RELE(ITOV(nip));
	}

	return (err);
}

/*
 * Write a new directory entry for DE_LINK, DE_SYMLINK or DE_RENAME operations.
 * If tvpp is non-null, return with the pointer to the target vnode.
*/
/*
 * ufs_direnter_lr() is entered with tdp->i_rwlock held as writer (asserted
 * below).  Unless op is DE_SYMLINK, sip's link count is bumped and pushed to
 * disk first and is decremented again if anything later fails.
 *
 * Returns 0 on success; EACCES for a '/' in the name; EINVAL (rename) or
 * EEXIST (otherwise) for "." and ".."; ENOENT, EMLINK or ENOTDIR from the
 * checks below; EEXIST when linking onto an existing name; or the error
 * returned by TRANS_SYNCIP(), ufs_sync_indir(), ufs_iaccess(),
 * ufs_dircheckforname(), ufs_dirrename() or ufs_diraddentry().
 */
int
ufs_direnter_lr(
	struct inode *tdp,	/* target directory to make entry in */
	char *namep,		/* name of entry */
	enum de_op op,		/* entry operation */
	struct inode *sdp,	/* source inode parent if rename */
	struct inode *sip,	/* source inode */
	struct cred *cr,	/* user credentials */
	vnode_t **tvpp)		/* Return: (held) vnode of (existing) target */
{
	struct inode *tip;	/* inode of (existing) target file */
	char *s;
	struct slot slot;	/* slot info to pass around */
	int namlen;		/* length of name */
	int err;		/* error number */

	/* don't allow '/' characters in pathname component */
	for (s = namep, namlen = 0; *s; s++, namlen++)
		if (*s == '/')
			return (EACCES);
	ASSERT(namlen);
	ASSERT(RW_WRITE_HELD(&tdp->i_rwlock));

	/*
	 * If name is "." or ".." then if this is a create look it up
	 * and return EEXIST.  Rename or link TO "." or ".." is forbidden.
	 */
	if (namep[0] == '.' &&
	    (namlen == 1 || (namlen == 2 && namep[1] == '.'))) {
		if (op == DE_RENAME) {
			return (EINVAL);	/* *SIGH* should be ENOTEMPTY */
		}
		return (EEXIST);
	}
	/*
	 * For link and rename lock the source entry and check the link count
	 * to see if it has been removed while it was unlocked.  If not, we
	 * increment the link count and force the inode to disk to make sure
	 * that it is there before any directory entry that points to it.
	 *
	 * In the case of a symbolic link, we are dealing with a new inode
	 * which does not yet have any links.  We've created it with a link
	 * count of 1, and we don't want to increment it since this will be
	 * its first link.
	 *
	 * We are about to push the inode to disk. We make sure
	 * that the inode's data blocks are flushed first so the
	 * inode and its data blocks are always in sync.  This
	 * adds some robustness in the event of a power failure
	 * or panic where sync fails. If we panic before the
	 * inode is updated, then the inode still refers to the
	 * old data blocks (or none for a new file). If we panic
	 * after the inode is updated, then the inode refers to
	 * the new data blocks.
	 *
	 * We do this before grabbing the i_contents lock because
	 * ufs_syncip() will want that lock. We could do the data
	 * syncing after the removal checks, but upon return from
	 * the data sync we would have to repeat the removal
	 * checks.
	 */
	if (err = TRANS_SYNCIP(sip, 0, I_DSYNC, TOP_FSYNC)) {
		return (err);
	}

	rw_enter(&sip->i_contents, RW_WRITER);
	if (sip->i_nlink <= 0) {
		rw_exit(&sip->i_contents);
		return (ENOENT);
	}
	if (sip->i_nlink == MAXLINK) {
		rw_exit(&sip->i_contents);
		return (EMLINK);
	}

	/*
	 * Sync the indirect blocks associated with the file
	 * for the same reasons as described above.  Since this
	 * call wants the i_contents lock held for it we can do
	 * this here with no extra work.
	 */
	if (err = ufs_sync_indir(sip)) {
		rw_exit(&sip->i_contents);
		return (err);
	}

	if (op != DE_SYMLINK)
		sip->i_nlink++;
	TRANS_INODE(sip->i_ufsvfs, sip);
	sip->i_flag |= ICHG;
	sip->i_seq++;
	ufs_iupdat(sip, I_SYNC);
	rw_exit(&sip->i_contents);

	/*
	 * From here on every failure leaves through out or out2 so that
	 * the link count bumped above is undone.
	 */

	/*
	 * If target directory has not been removed, then we can consider
	 * allowing file to be created.
	 */
	if (tdp->i_nlink <= 0) {
		err = ENOENT;
		goto out2;
	}

	/*
	 * Check accessibility of directory.
	 */
	if (((tdp->i_mode & IFMT) != IFDIR) &&
	    (tdp->i_mode & IFMT) != IFATTRDIR) {
		err = ENOTDIR;
		goto out2;
	}

	/*
	 * Execute access is required to search the directory.
	 */
	if (err = ufs_iaccess(tdp, IEXEC, cr)) {
		goto out2;
	}

	/*
	 * Search for the entry. Return VN_HELD tip if found.
	 */
	tip = NULL;
	slot.status = NONE;
	slot.fbp = NULL;
	rw_enter(&tdp->i_ufsvfs->vfs_dqrwlock, RW_READER);
	rw_enter(&tdp->i_contents, RW_WRITER);
	err = ufs_dircheckforname(tdp, namep, namlen, &slot, &tip, cr, 0);
	if (err)
		goto out;

	if (tip) {
		switch (op) {
		case DE_RENAME:
			err = ufs_dirrename(sdp, sip, tdp, namep,
			    tip, &slot, cr);
			break;

		case DE_LINK:
		case DE_SYMLINK:
			/*
			 * Can't link to an existing file.
			 */
			err = EEXIST;
			break;
		default:
			break;
		}
	} else {
		/*
		 * The entry does not exist. Check write permission in
		 * directory to see if entry can be created.
		 */
		if (err = ufs_iaccess(tdp, IWRITE, cr))
			goto out;
		err = ufs_diraddentry(tdp, namep, op, namlen, &slot, sip, sdp,
		    cr);
	}

out:
	if (slot.fbp)
		fbrelse(slot.fbp, S_OTHER);

	rw_exit(&tdp->i_contents);

	/*
	 * Drop vfs_dqrwlock before calling VN_RELE() on tip to
	 * avoid deadlock since ufs_delete() grabs vfs_dqrwlock as reader.
	 */
	rw_exit(&tdp->i_ufsvfs->vfs_dqrwlock);

	/*
	 * If we renamed a file over the top of an existing file,
	 * or linked a file to an existing file (or tried to),
	 * then set *tvpp to the target vnode, if tvpp is non-null
	 * otherwise, release and delete (or just release) the inode.
	 *
	 * N.B., by returning the target's vnode pointer to the caller,
	 * that caller becomes responsible for doing the VN_RELE.
	 */
	if (tip) {
		if ((err == 0) && (tvpp != NULL)) {
			*tvpp = ITOV(tip);
		} else {
			VN_RELE(ITOV(tip));
		}
	}

out2:
	if (err) {
		/*
		 * Undo bumped link count.
		 */
		if (op != DE_SYMLINK) {
			rw_enter(&sip->i_contents, RW_WRITER);
			sip->i_nlink--;
			ufs_setreclaim(sip);
			TRANS_INODE(sip->i_ufsvfs, sip);
			sip->i_flag |= ICHG;
			sip->i_seq++;
			ITIMES_NOLOCK(sip);
			rw_exit(&sip->i_contents);
		}
	}
	return (err);
}

/*
 * Check for the existence of a name in a directory (unless noentry
 * is set), or else of an empty slot in which an entry may be made.
 * If the requested name is found, then on return *ipp points at the
 * inode and slotp->offset contains its offset in the directory.
 * If the name is not found, then *ipp will be NULL and *slotp will
 * contain information about a directory slot in which an entry may
 * be made (either an empty slot, or the first position past the end
 * of the directory).
 * The target directory inode (tdp) is supplied write locked (i_rwlock).
 *
 * This may not be used on "." or "..", but aliases of "." are ok.
*/ int ufs_dircheckforname( struct inode *tdp, /* inode of directory being checked */ char *namep, /* name we're checking for */ int namlen, /* length of name, excluding null */ struct slot *slotp, /* slot structure */ struct inode **ipp, /* return inode if we find one */ struct cred *cr, int noentry) /* noentry - just look for space */ { uint64_t handle; struct fbuf *fbp; /* pointer to directory block */ struct direct *ep; /* directory entry */ struct direct *nep; /* next directory entry */ dcanchor_t *dcap; vnode_t *dvp; /* directory vnode ptr */ off_t dirsize; /* size of the directory */ off_t offset; /* offset in the directory */ off_t last_offset; /* last offset */ off_t enduseful; /* pointer past last used dir slot */ int entryoffsetinblk; /* offset of ep in fbp's buffer */ int i; /* length of mangled entry */ int needed; int err; int first; int caching; int stat; ino_t ep_ino; slotstat_t initstat = slotp->status; ASSERT(RW_WRITE_HELD(&tdp->i_rwlock)); ASSERT(RW_WRITE_HELD(&tdp->i_contents)); ASSERT(*ipp == NULL); fbp = NULL; /* * First check if there is a complete cache of the directory. */ dvp = ITOV(tdp); dcap = &tdp->i_danchor; if (noentry) { /* * We know from the 1st level dnlc cache that the entry * doesn't exist, so don't bother searching the directory * cache, but just look for space (possibly in the directory * cache). */ stat = DNOENT; } else { stat = dnlc_dir_lookup(dcap, namep, &handle); } switch (stat) { case DFOUND: ep_ino = (ino_t)H_TO_INO(handle); if (tdp->i_number == ep_ino) { *ipp = tdp; /* we want ourself, ie "." */ VN_HOLD(dvp); } else { err = ufs_iget_alloced(tdp->i_vfs, ep_ino, ipp, cr); if (err) return (err); } offset = H_TO_OFF(handle); first = 0; if (offset & 1) { /* This is the first entry in the block */ first = 1; offset -= 1; ASSERT((offset & (DIRBLKSIZ - 1)) == 0); } err = blkatoff(tdp, offset, (char **)&ep, &fbp); if (err) { VN_RELE(ITOV(*ipp)); *ipp = NULL; return (err); } /* * Check the validity of the entry. 
* If it's bad, then throw away the cache and * continue without it. The dirmangled() routine * will then be called upon it. */ if ((ep->d_reclen == 0) || (ep->d_reclen & 0x3)) { VN_RELE(ITOV(*ipp)); *ipp = NULL; dnlc_dir_purge(dcap); break; } /* * Remember the returned offset is the offset of the * preceding record (unless this is the 1st record * in the DIRBLKSIZ sized block (disk sector)), then it's * offset + 1. Note, no real offsets are on odd boundaries. */ if (first) { ASSERT((offset & (DIRBLKSIZ - 1)) == 0); slotp->offset = offset; slotp->size = 0; slotp->ep = ep; } else { /* get the next entry */ nep = (struct direct *)((char *)ep + ep->d_reclen); /* * Check the validity of this entry as well * If it's bad, then throw away the cache and * continue without it. The dirmangled() routine * will then be called upon it. */ if ((nep->d_reclen == 0) || (nep->d_reclen & 0x3) || (nep->d_ino != ep_ino)) { VN_RELE(ITOV(*ipp)); *ipp = NULL; dnlc_dir_purge(dcap); break; } slotp->offset = offset + ep->d_reclen; slotp->size = ep->d_reclen; slotp->ep = nep; } slotp->status = EXIST; slotp->fbp = fbp; slotp->endoff = 0; slotp->cached = 1; dnlc_update(dvp, namep, ITOV(*ipp)); return (0); case DNOENT: /* * The caller gets to set the initial slot status to * indicate whether it's interested in getting a * empty slot. For example, the status can be set * to FOUND when an entry is being deleted. */ ASSERT(slotp->fbp == NULL); if (slotp->status == FOUND) { return (0); } switch (dnlc_dir_rem_space_by_len(dcap, LDIRSIZ(namlen), &handle)) { case DFOUND: offset = (off_t)handle; err = blkatoff(tdp, offset, (char **)&ep, &fbp); if (err) { dnlc_dir_purge(dcap); ASSERT(*ipp == NULL); return (err); } /* * Check the validity of the entry. * If it's bad, then throw away the cache and * continue without it. The dirmangled() routine * will then be called upon it. 
*/ if ((ep->d_reclen == 0) || (ep->d_reclen & 0x3)) { dnlc_dir_purge(dcap); break; } /* * Remember the returned offset is the offset of the * containing record. */ slotp->status = FOUND; slotp->ep = ep; slotp->offset = offset; slotp->fbp = fbp; slotp->size = ep->d_reclen; /* * Set end offset to 0. Truncation is handled * because the dnlc cache will blow away the * cached directory when an entry is removed * that drops the entries left to less than half * the minumum number (dnlc_min_dir_cache). */ slotp->endoff = 0; slotp->cached = 1; return (0); case DNOENT: slotp->status = NONE; slotp->offset = P2ROUNDUP_TYPED(tdp->i_size, DIRBLKSIZ, u_offset_t); slotp->size = DIRBLKSIZ; slotp->endoff = 0; slotp->cached = 1; return (0); default: break; } break; } slotp->cached = 0; caching = NULL; if (!noentry && tdp->i_size >= ufs_min_dir_cache) { /* * if the directory caching disable time has expired * enable caching again. */ if (tdp->i_cachedir == CD_DISABLED_NOMEM && gethrtime() - ufs_dc_disable_at > ufs_dc_disable_duration) { ufs_dc_disable_at = 0; tdp->i_cachedir = CD_ENABLED; } /* * Attempt to cache any directories greater than the tunable * ufs_min_cache_dir. If it fails due to memory shortage * (DNOMEM), disable caching for this directory and record * the system time. Any attempt after the disable time has * expired will enable the caching again. */ if (tdp->i_cachedir == CD_ENABLED) { switch (dnlc_dir_start(dcap, tdp->i_size >> AV_DIRECT_SHIFT)) { case DNOMEM: tdp->i_cachedir = CD_DISABLED_NOMEM; ufs_dc_disable_at = gethrtime(); break; case DTOOBIG: tdp->i_cachedir = CD_DISABLED_TOOBIG; break; case DOK: caching = 1; break; default: break; } } } /* * No point in using i_diroff since we must search whole directory */ dirsize = P2ROUNDUP_TYPED(tdp->i_size, DIRBLKSIZ, u_offset_t); enduseful = 0; offset = last_offset = 0; entryoffsetinblk = 0; needed = (int)LDIRSIZ(namlen); while (offset < dirsize) { /* * If offset is on a block boundary, * read the next directory block. 
* Release previous if it exists. */ if (blkoff(tdp->i_fs, offset) == 0) { if (fbp != NULL) fbrelse(fbp, S_OTHER); err = blkatoff(tdp, offset, (char **)0, &fbp); if (err) { ASSERT(*ipp == NULL); if (caching) { dnlc_dir_purge(dcap); } return (err); } entryoffsetinblk = 0; } /* * If still looking for a slot, and at a DIRBLKSIZ * boundary, have to start looking for free space * again. */ if (slotp->status == NONE && (entryoffsetinblk & (DIRBLKSIZ - 1)) == 0) { slotp->offset = -1; } /* * If the next entry is a zero length record or if the * record length is invalid, then skip to the next * directory block. Complete validation checks are * done if the record length is invalid. * * Full validation checks are slow so they are disabled * by default. Complete checks can be run by patching * "dirchk" to be true. * * We do not have to check the validity of * entryoffsetinblk here because it starts out as zero * and is only incremented by d_reclen values that we * validate here. */ ep = (struct direct *)(fbp->fb_addr + entryoffsetinblk); if (ep->d_reclen == 0 || (dirchk || (ep->d_reclen & 0x3)) && dirmangled(tdp, ep, entryoffsetinblk, offset)) { i = DIRBLKSIZ - (entryoffsetinblk & (DIRBLKSIZ - 1)); offset += i; entryoffsetinblk += i; if (caching) { dnlc_dir_purge(dcap); caching = 0; } continue; } /* * Add named entries and free space into the directory cache */ if (caching) { ushort_t extra; off_t off2; if (ep->d_ino == 0) { extra = ep->d_reclen; if (offset & (DIRBLKSIZ - 1)) { dnlc_dir_purge(dcap); caching = 0; } } else { /* * entries hold the previous offset if * not the 1st one */ if (offset & (DIRBLKSIZ - 1)) { off2 = last_offset; } else { off2 = offset + 1; } caching = (dnlc_dir_add_entry(dcap, ep->d_name, INO_OFF_TO_H(ep->d_ino, off2)) == DOK); extra = ep->d_reclen - DIRSIZ(ep); } if (caching && (extra >= LDIRSIZ(1))) { caching = (dnlc_dir_add_space(dcap, extra, (uint64_t)offset) == DOK); } } /* * If an appropriate sized slot has not yet been found, * check to see if one 
is available. */ if ((slotp->status != FOUND) && (slotp->status != EXIST)) { int size = ep->d_reclen; if (ep->d_ino != 0) size -= DIRSIZ(ep); if (size > 0) { if (size >= needed) { slotp->offset = offset; slotp->size = ep->d_reclen; if (noentry) { slotp->ep = ep; slotp->fbp = fbp; slotp->status = FOUND; slotp->endoff = 0; return (0); } slotp->status = FOUND; } else if (slotp->status == NONE) { if (slotp->offset == -1) slotp->offset = offset; } } } /* * Check for a name match. */ if (ep->d_ino && ep->d_namlen == namlen && *namep == *ep->d_name && /* fast chk 1st char */ bcmp(namep, ep->d_name, namlen) == 0) { tdp->i_diroff = offset; if (tdp->i_number == ep->d_ino) { *ipp = tdp; /* we want ourself, ie "." */ VN_HOLD(dvp); } else { err = ufs_iget_alloced(tdp->i_vfs, (ino_t)ep->d_ino, ipp, cr); if (err) { fbrelse(fbp, S_OTHER); if (caching) dnlc_dir_purge(dcap); return (err); } } slotp->status = EXIST; slotp->offset = offset; slotp->size = (int)(offset - last_offset); slotp->fbp = fbp; slotp->ep = ep; slotp->endoff = 0; if (caching) dnlc_dir_purge(dcap); return (0); } last_offset = offset; offset += ep->d_reclen; entryoffsetinblk += ep->d_reclen; if (ep->d_ino) enduseful = offset; } if (fbp) { fbrelse(fbp, S_OTHER); } if (caching) { dnlc_dir_complete(dcap); slotp->cached = 1; if (slotp->status == FOUND) { if (initstat == FOUND) { return (0); } (void) dnlc_dir_rem_space_by_handle(dcap, slotp->offset); slotp->endoff = 0; return (0); } } if (slotp->status == NONE) { /* * We didn't find a slot; the new directory entry should be put * at the end of the directory. Return an indication of where * this is, and set "endoff" to zero; since we're going to have * to extend the directory, we're certainly not going to * truncate it. */ slotp->offset = dirsize; slotp->size = DIRBLKSIZ; slotp->endoff = 0; } else { /* * We found a slot, and will return an indication of where that * slot is, as any new directory entry will be put there. 
* Since that slot will become a useful entry, if the last * useful entry we found was before this one, update the offset * of the last useful entry. */ if (enduseful < slotp->offset + slotp->size) enduseful = slotp->offset + slotp->size; slotp->endoff = P2ROUNDUP_TYPED(enduseful, DIRBLKSIZ, off_t); } *ipp = NULL; return (0); } uint64_t ufs_dirrename_retry_cnt; /* * Rename the entry in the directory tdp so that it points to * sip instead of tip. */ static int ufs_dirrename( struct inode *sdp, /* parent directory of source */ struct inode *sip, /* source inode */ struct inode *tdp, /* parent directory of target */ char *namep, /* entry we are trying to change */ struct inode *tip, /* target inode */ struct slot *slotp, /* slot for entry */ struct cred *cr) /* credentials */ { vnode_t *tdvp; off_t offset; int err; int doingdirectory; ASSERT(sdp->i_ufsvfs != NULL); ASSERT(RW_WRITE_HELD(&tdp->i_rwlock)); ASSERT(RW_WRITE_HELD(&tdp->i_contents)); /* * Short circuit rename of something to itself. */ if (sip->i_number == tip->i_number) { return (ESAME); /* special KLUDGE error code */ } /* * We're locking 2 peer level locks, so must use tryenter * on the 2nd to avoid deadlocks that would occur * if we renamed a->b and b->a concurrently. */ retry: rw_enter(&tip->i_contents, RW_WRITER); if (!rw_tryenter(&sip->i_contents, RW_READER)) { /* * drop tip and wait (sleep) until we stand a chance * of holding sip */ rw_exit(&tip->i_contents); rw_enter(&sip->i_contents, RW_READER); /* * Reverse the lock grabs in case we have heavy * contention on the 2nd lock. */ if (!rw_tryenter(&tip->i_contents, RW_WRITER)) { ufs_dirrename_retry_cnt++; rw_exit(&sip->i_contents); goto retry; } } /* * Check that everything is on the same filesystem. */ if ((ITOV(tip)->v_vfsp != ITOV(tdp)->v_vfsp) || (ITOV(tip)->v_vfsp != ITOV(sip)->v_vfsp)) { err = EXDEV; /* XXX archaic */ goto out; } /* * Must have write permission to rewrite target entry. * Perform additional checks for sticky directories. 
*/ if ((err = ufs_iaccess(tdp, IWRITE, cr)) != 0 || (err = ufs_sticky_remove_access(tdp, tip, cr)) != 0) goto out; /* * Ensure source and target are compatible (both directories * or both not directories). If target is a directory it must * be empty and have no links to it; in addition it must not * be a mount point, and both the source and target must be * writable. */ doingdirectory = (((sip->i_mode & IFMT) == IFDIR) || ((sip->i_mode & IFMT) == IFATTRDIR)); if (((tip->i_mode & IFMT) == IFDIR) || ((tip->i_mode & IFMT) == IFATTRDIR)) { if (!doingdirectory) { err = EISDIR; goto out; } /* * vn_vfswlock will prevent mounts from using the directory * until we are done. */ if (vn_vfswlock(ITOV(tip))) { err = EBUSY; goto out; } if (vn_mountedvfs(ITOV(tip)) != NULL) { vn_vfsunlock(ITOV(tip)); err = EBUSY; goto out; } if (!ufs_dirempty(tip, tdp->i_number, cr) || tip->i_nlink > 2) { vn_vfsunlock(ITOV(tip)); err = EEXIST; /* SIGH should be ENOTEMPTY */ goto out; } } else if (doingdirectory) { err = ENOTDIR; goto out; } /* * Rewrite the inode pointer for target name entry * from the target inode (ip) to the source inode (sip). * This prevents the target entry from disappearing * during a crash. Mark the directory inode to reflect the changes. */ tdvp = ITOV(tdp); slotp->ep->d_ino = (int32_t)sip->i_number; dnlc_update(tdvp, namep, ITOV(sip)); if (slotp->size) { offset = slotp->offset - slotp->size; } else { offset = slotp->offset + 1; } if (slotp->cached) { (void) dnlc_dir_update(&tdp->i_danchor, namep, INO_OFF_TO_H(slotp->ep->d_ino, offset)); } err = TRANS_DIR(tdp, slotp->offset); if (err) fbrelse(slotp->fbp, S_OTHER); else err = ufs_fbwrite(slotp->fbp, tdp); slotp->fbp = NULL; if (err) { if (doingdirectory) vn_vfsunlock(ITOV(tip)); goto out; } TRANS_INODE(tdp->i_ufsvfs, tdp); tdp->i_flag |= IUPD|ICHG; tdp->i_seq++; ITIMES_NOLOCK(tdp); /* * Decrement the link count of the target inode. * Fix the ".." entry in sip to point to dp. 
* This is done after the new entry is on the disk. */ tip->i_nlink--; TRANS_INODE(tip->i_ufsvfs, tip); tip->i_flag |= ICHG; tip->i_seq++; ITIMES_NOLOCK(tip); if (doingdirectory) { /* * The entry for tip no longer exists so I can unlock the * vfslock. */ vn_vfsunlock(ITOV(tip)); /* * Decrement target link count once more if it was a directory. */ if (--tip->i_nlink != 0) { err = ufs_fault(ITOV(tip), "ufs_dirrename: target directory link count != 0 (%s)", tip->i_fs->fs_fsmnt); rw_exit(&tip->i_contents); return (err); } TRANS_INODE(tip->i_ufsvfs, tip); ufs_setreclaim(tip); /* * Renaming a directory with the parent different * requires that ".." be rewritten. The window is * still there for ".." to be inconsistent, but this * is unavoidable, and a lot shorter than when it was * done in a user process. We decrement the link * count in the new parent as appropriate to reflect * the just-removed target. If the parent is the * same, this is appropriate since the original * directory is going away. If the new parent is * different, ufs_dirfixdotdot() will bump the link count * back. */ tdp->i_nlink--; ufs_setreclaim(tdp); TRANS_INODE(tdp->i_ufsvfs, tdp); tdp->i_flag |= ICHG; tdp->i_seq++; ITIMES_NOLOCK(tdp); if (sdp != tdp) { rw_exit(&tip->i_contents); rw_exit(&sip->i_contents); err = ufs_dirfixdotdot(sip, sdp, tdp); return (err); } } else ufs_setreclaim(tip); out: rw_exit(&tip->i_contents); rw_exit(&sip->i_contents); return (err); } /* * Fix the ".." entry of the child directory so that it points * to the new parent directory instead of the old one. Routine * assumes that dp is a directory and that all the inodes are on * the same file system. 
*/
/*
 * The caller must hold npdp's i_rwlock and i_contents as writer (asserted).
 * dp's i_rwlock and i_contents are taken and released here; opdp's
 * i_contents is taken and released while its link count is dropped.
 *
 * Returns 0 on success.  Errors generated here are ENOENT (child has no
 * links or is smaller than a dirtemplate), ENOTDIR (mangled ".." entry)
 * and EMLINK (new parent already at MAXLINK); errors from blkatoff(),
 * TRANS_DIR() and ufs_fbwrite() are passed through.
 */
static int
ufs_dirfixdotdot(
	struct inode *dp,	/* child directory */
	struct inode *opdp,	/* old parent directory */
	struct inode *npdp)	/* new parent directory */
{
	struct fbuf *fbp;
	struct dirtemplate *dirp;
	vnode_t *dvp;
	int err;

	ASSERT(RW_WRITE_HELD(&npdp->i_rwlock));
	ASSERT(RW_WRITE_HELD(&npdp->i_contents));

	/*
	 * We hold the child directory's i_contents lock before calling
	 * blkatoff so that we honor correct locking protocol which is
	 * i_contents lock and then page lock. (blkatoff will call
	 * ufs_getpage where we want the page lock)
	 * We hold the child directory's i_rwlock before i_contents (as
	 * per the locking protocol) since we are modifying the ".." entry
	 * of the child directory.
	 * We hold the i_rwlock and i_contents lock until we record
	 * this directory delta to the log (via ufs_trans_dir) and have
	 * done fbrelse.
	 */
	rw_enter(&dp->i_rwlock, RW_WRITER);
	rw_enter(&dp->i_contents, RW_WRITER);
	/* blkatoff() sets fbp to NULL on failure, so "bad" can test it */
	err = blkatoff(dp, (off_t)0, (char **)&dirp, &fbp);
	if (err)
		goto bad;

	if (dp->i_nlink <= 0 ||
	    dp->i_size < sizeof (struct dirtemplate)) {
		err = ENOENT;
		goto bad;
	}

	if (dirp->dotdot_namlen != 2 ||
	    dirp->dotdot_name[0] != '.' ||
	    dirp->dotdot_name[1] != '.') {	/* Sanity check. */
		dirbad(dp, "mangled .. entry", (off_t)0);
		err = ENOTDIR;
		goto bad;
	}

	/*
	 * Increment the link count in the new parent inode and force it out.
	 */
	if (npdp->i_nlink == MAXLINK) {
		err = EMLINK;
		goto bad;
	}
	npdp->i_nlink++;
	TRANS_INODE(npdp->i_ufsvfs, npdp);
	npdp->i_flag |= ICHG;
	npdp->i_seq++;
	ufs_iupdat(npdp, I_SYNC);

	/*
	 * Rewrite the child ".." entry and force it out.
	 */
	dvp = ITOV(dp);
	dirp->dotdot_ino = (uint32_t)npdp->i_number;
	dnlc_update(dvp, "..", ITOV(npdp));
	(void) dnlc_dir_update(&dp->i_danchor, "..",
	    INO_OFF_TO_H(dirp->dotdot_ino, 0));

	err = TRANS_DIR(dp, 0);
	if (err)
		fbrelse(fbp, S_OTHER);
	else
		err = ufs_fbwrite(fbp, dp);

	/*
	 * The buffer has been released or written either way; clear fbp
	 * so the "bad" path does not release it again.
	 * NOTE(review): the npdp link count bumped above is not rolled
	 * back if this write fails.
	 */
	fbp = NULL;
	if (err)
		goto bad;

	rw_exit(&dp->i_contents);
	rw_exit(&dp->i_rwlock);

	/*
	 * Decrement the link count of the old parent inode and force it out.
	 */
	ASSERT(opdp);
	rw_enter(&opdp->i_contents, RW_WRITER);
	ASSERT(opdp->i_nlink > 0);
	opdp->i_nlink--;
	ufs_setreclaim(opdp);
	TRANS_INODE(opdp->i_ufsvfs, opdp);
	opdp->i_flag |= ICHG;
	opdp->i_seq++;
	ufs_iupdat(opdp, I_SYNC);
	rw_exit(&opdp->i_contents);
	return (0);

bad:
	if (fbp)
		fbrelse(fbp, S_OTHER);
	rw_exit(&dp->i_contents);
	rw_exit(&dp->i_rwlock);
	return (err);
}

/*
 * Enter the file sip in the directory tdp with name namep.
 *
 * The caller must hold tdp's i_rwlock and i_contents as writer (asserted).
 * slotp describes where the entry goes; it is prepared by
 * dirprepareentry() and its buffer (slotp->fbp) is released or written
 * and set to NULL before this routine returns.  When op is DE_RENAME and
 * sip is a directory whose parent changes (sdp != tdp), the ".." entry
 * of sip is fixed up first via ufs_dirfixdotdot().
 *
 * Returns 0 on success, EXDEV if sip is on a different filesystem, or an
 * error passed through from dirprepareentry(), ufs_dirfixdotdot(),
 * TRANS_DIR() or ufs_fbwrite().
 */
static int
ufs_diraddentry(
	struct inode *tdp,
	char *namep,
	enum de_op op,
	int namlen,
	struct slot *slotp,
	struct inode *sip,
	struct inode *sdp,
	struct cred *cr)
{
	struct direct *ep, *nep;
	vnode_t *tdvp;
	dcanchor_t *dcap = &tdp->i_danchor;
	off_t offset;
	int err;
	ushort_t extra;

	ASSERT(RW_WRITE_HELD(&tdp->i_rwlock));
	ASSERT(RW_WRITE_HELD(&tdp->i_contents));
	/*
	 * Prepare a new entry.  If the caller has not supplied an
	 * existing inode, make a new one.
	 */
	err = dirprepareentry(tdp, slotp, cr);
	if (err) {
		if (slotp->fbp) {
			fbrelse(slotp->fbp, S_OTHER);
			slotp->fbp = NULL;
		}
		return (err);
	}

	/*
	 * Check inode to be linked to see if it is in the
	 * same filesystem.
	 */
	if (ITOV(tdp)->v_vfsp != ITOV(sip)->v_vfsp) {
		err = EXDEV;
		goto bad;
	}

	/*
	 * If renaming a directory then fix up the ".." entry in the
	 * directory to point to the new parent.
	 */
	if ((op == DE_RENAME) && (((sip->i_mode & IFMT) == IFDIR) ||
	    ((sip->i_mode & IFMT) == IFATTRDIR)) && (sdp != tdp)) {
		err = ufs_dirfixdotdot(sip, sdp, tdp);
		if (err)
			goto bad;
	}

	/*
	 * Fill in entry data.
	 */
	ep = slotp->ep;
	ep->d_namlen = (ushort_t)namlen;
	/*
	 * The copy length is namlen + 1 rounded up to a multiple of 4;
	 * strncpy() zero-fills whatever follows the name within that span.
	 */
	(void) strncpy(ep->d_name, namep, (size_t)((namlen + 4) & ~3));
	ep->d_ino = (uint32_t)sip->i_number;
	tdvp = ITOV(tdp);
	dnlc_update(tdvp, namep, ITOV(sip));
	/*
	 * Note the offset supplied for any named entry is
	 * the offset of the previous one, unless it's the 1st.
	 * slotp->size is used to pass the length to
	 * the previous entry.
	 */
	if (slotp->size) {
		offset = slotp->offset - slotp->size;
	} else {
		offset = slotp->offset + 1;
	}

	if (slotp->cached) {
		/*
		 * Add back any usable unused space to the dnlc directory
		 * cache.
		 */
		extra = ep->d_reclen - DIRSIZ(ep);
		if (extra >= LDIRSIZ(1)) {
			(void) dnlc_dir_add_space(dcap, extra,
			    (uint64_t)slotp->offset);
		}

		(void) dnlc_dir_add_entry(dcap, namep,
		    INO_OFF_TO_H(ep->d_ino, offset));

		/* adjust the previous offset of the next entry */
		nep = (struct direct *)((char *)ep + ep->d_reclen);
		if ((uintptr_t)nep & (DIRBLKSIZ - 1)) {
			/*
			 * Not a new block.
			 *
			 * Check the validity of the next entry.
			 * If it's bad, then throw away the cache, and
			 * continue as before directory caching.
			 */
			if ((nep->d_reclen == 0) || (nep->d_reclen & 0x3) ||
			    dnlc_dir_update(dcap, nep->d_name,
			    INO_OFF_TO_H(nep->d_ino, slotp->offset))
			    == DNOENT) {
				dnlc_dir_purge(dcap);
				slotp->cached = 0;
			}
		}
	}

	/*
	 * Write out the directory block.
	 */
	err = TRANS_DIR(tdp, slotp->offset);
	if (err)
		fbrelse(slotp->fbp, S_OTHER);
	else
		err = ufs_fbwrite(slotp->fbp, tdp);

	slotp->fbp = NULL;
	/*
	 * If this is a rename of a directory, then we have already
	 * fixed the ".." entry to refer to the new parent. If err
	 * is true at this point, we have failed to update the new
	 * parent to refer to the renamed directory.
	 * XXX - we need to unwind the ".." fix.
	 */
	if (err)
		return (err);

	/*
	 * Mark the directory inode to reflect the changes.
	 * Truncate the directory to chop off blocks of empty entries.
	 */

	TRANS_INODE(tdp->i_ufsvfs, tdp);
	tdp->i_flag |= IUPD|ICHG;
	tdp->i_seq++;
	tdp->i_diroff = 0;
	ITIMES_NOLOCK(tdp);
	/*
	 * If the directory grew then dirprepareentry() will have
	 * set IATTCHG in tdp->i_flag, then the directory inode must
	 * be flushed out. This is because if fsync() is used later
	 * the directory size must be correct, otherwise a crash would
	 * cause fsck to move the file to lost+found. Also because later
	 * a file may be linked in more than one directory, then there
	 * is no way to flush the original directory. So it must be
	 * flushed out on creation. See bug 4293809.
	 */
	if (tdp->i_flag & IATTCHG) {
		ufs_iupdat(tdp, I_SYNC);
	}

	/* truncation is skipped when TRANS_ISTRANS() is true */
	if (slotp->endoff && (slotp->endoff < tdp->i_size)) {
		if (!TRANS_ISTRANS(tdp->i_ufsvfs)) {
			(void) ufs_itrunc(tdp, (u_offset_t)slotp->endoff, 0,
			    cr);
		}
	}

	return (0);

bad:
	/*
	 * With directory caching active the cache is simply discarded and
	 * the buffer released without being rewritten.
	 */
	if (slotp->cached) {
		dnlc_dir_purge(dcap);
		fbrelse(slotp->fbp, S_OTHER);
		slotp->cached = 0;
		slotp->fbp = NULL;
		return (err);
	}

	/*
	 * Clear out entry prepared by dirprepareentry().
	 */
	slotp->ep->d_ino = 0;
	slotp->ep->d_namlen = 0;

	/*
	 * Don't touch err so we don't clobber the real error that got us here.
	 */
	if (TRANS_DIR(tdp, slotp->offset))
		fbrelse(slotp->fbp, S_OTHER);
	else
		(void) ufs_fbwrite(slotp->fbp, tdp);
	slotp->fbp = NULL;
	return (err);
}

/*
 * Prepare a directory slot to receive an entry.
 *
 * slotp->status must be NONE (append a new DIRBLKSIZ block at
 * slotp->offset) or FOUND (reuse the space at slotp->offset); this is
 * asserted.  The caller must hold dp's i_rwlock and i_contents as writer.
 *
 * On return slotp->ep points at the record to fill in, slotp->fbp holds
 * the mapped block, slotp->offset is the offset of that record, and
 * slotp->size is the length of the preceding entry (0 if there is none).
 *
 * Returns 0 on success or an error from ufs_fault(), BMAPALLOC() or
 * blkatoff().
 */
static int
dirprepareentry(
	struct inode *dp,	/* directory we are working in */
	struct slot *slotp,	/* available slot info */
	struct cred *cr)
{
	struct direct *ep, *nep;
	off_t entryend;
	int err;
	slotstat_t status = slotp->status;
	ushort_t dsize;

	ASSERT((status == NONE) || (status == FOUND));
	ASSERT(RW_WRITE_HELD(&dp->i_rwlock));
	ASSERT(RW_WRITE_HELD(&dp->i_contents));
	/*
	 * If we didn't find a slot, then indicate that the
	 * new slot belongs at the end of the directory.
	 * If we found a slot, then the new entry can be
	 * put at slotp->offset.
	 */
	entryend = slotp->offset + slotp->size;
	if (status == NONE) {
		ASSERT((slotp->offset & (DIRBLKSIZ - 1)) == 0);
		if (DIRBLKSIZ > dp->i_fs->fs_fsize) {
			err = ufs_fault(ITOV(dp),
			    "dirprepareentry: bad fs_fsize, DIRBLKSIZ: %d"
			    " > dp->i_fs->fs_fsize: %d (%s)",
			    DIRBLKSIZ, dp->i_fs->fs_fsize, dp->i_fs->fs_fsmnt);
			return (err);
		}
		/*
		 * Allocate the new block.
		 */
		err = BMAPALLOC(dp, (u_offset_t)slotp->offset,
		    (int)(blkoff(dp->i_fs, slotp->offset) + DIRBLKSIZ), cr);
		if (err) {
			return (err);
		}
		dp->i_size = entryend;
		TRANS_INODE(dp->i_ufsvfs, dp);
		dp->i_flag |= IUPD|ICHG|IATTCHG;
		dp->i_seq++;
		ITIMES_NOLOCK(dp);
	} else if (entryend > dp->i_size) {
		/*
		 * Adjust directory size, if needed. This should never
		 * push the size past a new multiple of DIRBLKSIZ.
		 * This is an artifact of the old (4.2BSD) way of initializing
		 * directory sizes to be less than DIRBLKSIZ.
		 */
		dp->i_size = P2ROUNDUP_TYPED(entryend, DIRBLKSIZ, off_t);
		TRANS_INODE(dp->i_ufsvfs, dp);
		dp->i_flag |= IUPD|ICHG|IATTCHG;
		dp->i_seq++;
		ITIMES_NOLOCK(dp);
	}

	/*
	 * Get the block containing the space for the new directory entry.
	 */
	if (slotp->fbp == NULL) {
		err = blkatoff(dp, slotp->offset, (char **)&slotp->ep,
		    &slotp->fbp);
		if (err) {
			return (err);
		}
	}
	ep = slotp->ep;

	switch (status) {
	case NONE:
		/*
		 * No space in the directory. slotp->offset will be on a
		 * directory block boundary and we will write the new entry
		 * into a fresh block.
		 */
		ep->d_reclen = DIRBLKSIZ;
		slotp->size = 0; /* length of previous entry */
		break;
	case FOUND:
		/*
		 * An entry of the required size has been found. Use it.
		 */
		if (ep->d_ino == 0) {
			/* this is the 1st record in a block */
			slotp->size = 0; /* length of previous entry */
		} else {
			/*
			 * Split the record: the existing entry keeps only
			 * the space it needs (DIRSIZ) and the remainder
			 * becomes the new record.
			 */
			dsize = DIRSIZ(ep);
			nep = (struct direct *)((char *)ep + dsize);
			nep->d_reclen = ep->d_reclen - dsize;
			ep->d_reclen = dsize;
			slotp->ep = nep;
			slotp->offset += dsize;
			slotp->size = dsize; /* length of previous entry */
		}
		break;
	default:
		break;
	}
	return (0);
}

/*
 * Allocate and initialize a new inode that will go into directory tdp.
 * This routine is called from ufs_symlink(), as well as within this file.
*/
/*
 * The caller must hold tdp's i_rwlock and i_contents as writer, and vap
 * must carry at least AT_TYPE and AT_MODE (all asserted).  op must be one
 * of DE_CREATE, DE_MKDIR, DE_ATTRDIR or DE_SYMLINK.
 *
 * *ipp is set to NULL before the allocation is attempted and to the new
 * inode once it has been allocated.  The new inode's i_contents lock is
 * taken here and released before returning.  Note that on the "fail"
 * path *ipp still refers to the inode, whose link count has been set to
 * zero.
 *
 * Returns 0 on success, EOVERFLOW for an unrepresentable device number or
 * timestamp, or an error from ufs_ialloc(), ufs_fault(),
 * ufs_dirmakedirect() or ufs_si_inherit().
 */
int
ufs_dirmakeinode(
	struct inode *tdp,
	struct inode **ipp,
	struct vattr *vap,
	enum de_op op,
	struct cred *cr)
{
	struct inode *ip;
	enum vtype type;
	int imode;			/* mode and format as in inode */
	ino_t ipref;
	int err;
	timestruc_t now;

	ASSERT(vap != NULL);
	ASSERT(op == DE_CREATE || op == DE_MKDIR || op == DE_ATTRDIR ||
	    op == DE_SYMLINK);
	ASSERT((vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));
	ASSERT(RW_WRITE_HELD(&tdp->i_rwlock));
	ASSERT(RW_WRITE_HELD(&tdp->i_contents));
	/*
	 * Allocate a new inode.
	 */
	type = vap->va_type;
	if (type == VDIR) {
		ipref = dirpref(tdp);
	} else {
		ipref = tdp->i_number;
	}
	/* for DE_ATTRDIR va_mode is used as-is, without MAKEIMODE() */
	if (op == DE_ATTRDIR)
		imode = vap->va_mode;
	else
		imode = MAKEIMODE(type, vap->va_mode);
	*ipp = NULL;
	err = ufs_ialloc(tdp, ipref, imode, &ip, cr);
	if (err)
		return (err);

	/*
	 * We don't need to grab vfs_dqrwlock here because it is held
	 * in ufs_direnter_*() above us.
	 */
	ASSERT(RW_READ_HELD(&ip->i_ufsvfs->vfs_dqrwlock));
	rw_enter(&ip->i_contents, RW_WRITER);
	if (ip->i_dquot != NULL) {
		err = ufs_fault(ITOV(ip),
		    "ufs_dirmakeinode, ip->i_dquot != NULL: dquot (%s)",
		    tdp->i_fs->fs_fsmnt);
		rw_exit(&ip->i_contents);
		return (err);
	}
	*ipp = ip;
	ip->i_mode = (o_mode_t)imode;
	if (type == VBLK || type == VCHR) {
		dev_t d = vap->va_rdev;
		dev32_t dev32;

		/*
		 * Don't allow a special file to be created with a
		 * dev_t that cannot be represented by this filesystem
		 * format on disk.
		 */
		if (!cmpldev(&dev32, d)) {
			err = EOVERFLOW;
			goto fail;
		}

		ITOV(ip)->v_rdev = ip->i_rdev = d;

		if (dev32 & ~((O_MAXMAJ << L_BITSMINOR32) | O_MAXMIN)) {
			ip->i_ordev = dev32; /* can't use old format */
		} else {
			ip->i_ordev = cmpdev(d);
		}
	}
	ITOV(ip)->v_type = type;
	ufs_reset_vnode(ip->i_vnode);
	if (type == VDIR) {
		ip->i_nlink = 2; /* anticipating a call to dirmakedirect */
	} else {
		ip->i_nlink = 1;
	}

	if (op == DE_ATTRDIR) {
		ip->i_uid = vap->va_uid;
		ip->i_gid = vap->va_gid;
	} else
		ip->i_uid = crgetuid(cr);
	/*
	 * To determine the group-id of the created file:
	 *   1) If the gid is set in the attribute list (non-Sun & pre-4.0
	 *	clients are not likely to set the gid), then use it if
	 *	the process is privileged, belongs to the target group,
	 *	or the group is the same as the parent directory.
	 *   2) If the filesystem was not mounted with the Old-BSD-compatible
	 *	GRPID option, and the directory's set-gid bit is clear,
	 *	then use the process's gid.
	 *   3) Otherwise, set the group-id to the gid of the parent directory.
	 */
	if (op != DE_ATTRDIR && (vap->va_mask & AT_GID) &&
	    ((vap->va_gid == tdp->i_gid) || groupmember(vap->va_gid, cr) ||
	    secpolicy_vnode_create_gid(cr) == 0)) {
		/*
		 * XXX - is this only the case when a 4.0 NFS client, or a
		 * client derived from that code, makes a call over the wire?
		 */
		ip->i_gid = vap->va_gid;
	} else
		ip->i_gid = (tdp->i_mode & ISGID) ? tdp->i_gid : crgetgid(cr);

	/*
	 * For SunOS 5.0->5.4, the lines below read:
	 *
	 * ip->i_suid = (ip->i_uid > MAXUID) ? UID_LONG : ip->i_uid;
	 * ip->i_sgid = (ip->i_gid > MAXUID) ? GID_LONG : ip->i_gid;
	 *
	 * where MAXUID was set to 60002.  See notes on this in ufs_inode.c
	 */
	ip->i_suid =
	    (ulong_t)ip->i_uid > (ulong_t)USHRT_MAX ? UID_LONG : ip->i_uid;
	ip->i_sgid =
	    (ulong_t)ip->i_gid > (ulong_t)USHRT_MAX ? GID_LONG : ip->i_gid;

	/*
	 * If we're creating a directory, and the parent directory has the
	 * set-GID bit set, set it on the new directory.
	 * Otherwise, if the user is neither privileged nor a member of the
	 * file's new group, clear the file's set-GID bit.
	 */
	if ((tdp->i_mode & ISGID) && (type == VDIR))
		ip->i_mode |= ISGID;
	else {
		if ((ip->i_mode & ISGID) &&
		    secpolicy_vnode_setids_setgids(cr, ip->i_gid) != 0)
			ip->i_mode &= ~ISGID;
	}

	/* reject supplied times that TIMESPEC_OVERFLOW() flags */
	if (((vap->va_mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) ||
	    ((vap->va_mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) {
		err = EOVERFLOW;
		goto fail;
	}

	/*
	 * Extended attribute directories are not subject to quotas.
	 */
	if (op != DE_ATTRDIR)
		ip->i_dquot = getinoquota(ip);
	else
		ip->i_dquot = NULL;

	if (op == DE_MKDIR || op == DE_ATTRDIR) {
		err = ufs_dirmakedirect(ip, tdp, (op == DE_MKDIR) ? 0 : 1, cr);
		if (err)
			goto fail;
	}

	/*
	 * generate the shadow inode and attach it to the new object
	 */
	ASSERT((tdp->i_shadow && tdp->i_ufs_acl) ||
	    (!tdp->i_shadow && !tdp->i_ufs_acl));
	if (tdp->i_shadow && tdp->i_ufs_acl &&
	    (((tdp->i_mode & IFMT) == IFDIR) ||
	    ((tdp->i_mode & IFMT) == IFATTRDIR))) {
		err = ufs_si_inherit(ip, tdp, ip->i_mode, cr);
		if (err) {
			if (op == DE_MKDIR) {
				/*
				 * clean up parent directory
				 *
				 * tdp->i_contents already locked from
				 * ufs_direnter_*()
				 */
				tdp->i_nlink--;
				TRANS_INODE(tdp->i_ufsvfs, tdp);
				tdp->i_flag |= ICHG;
				tdp->i_seq++;
				ufs_iupdat(tdp, I_SYNC);
			}
			goto fail;
		}
	}

	/*
	 * If the passed in attributes contain atime and/or mtime
	 * settings, then use them instead of using the current
	 * high resolution time.
	 */
	if (vap->va_mask & (AT_MTIME|AT_ATIME)) {
		if (vap->va_mask & AT_ATIME) {
			ip->i_atime.tv_sec = vap->va_atime.tv_sec;
			ip->i_atime.tv_usec = vap->va_atime.tv_nsec / 1000;
			ip->i_flag &= ~IACC;
		} else
			ip->i_flag |= IACC;
		if (vap->va_mask & AT_MTIME) {
			ip->i_mtime.tv_sec = vap->va_mtime.tv_sec;
			ip->i_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;
			gethrestime(&now);
			if (now.tv_sec > TIME32_MAX) {
				/*
				 * In 2038, ctime sticks forever..
				 */
				ip->i_ctime.tv_sec = TIME32_MAX;
				ip->i_ctime.tv_usec = 0;
			} else {
				ip->i_ctime.tv_sec = now.tv_sec;
				ip->i_ctime.tv_usec = now.tv_nsec / 1000;
			}
			ip->i_flag &= ~(IUPD|ICHG);
			ip->i_flag |= IMODTIME;
		} else
			ip->i_flag |= IUPD|ICHG;
		ip->i_flag |= IMOD;
	} else
		ip->i_flag |= IACC|IUPD|ICHG;
	ip->i_seq++;

	/*
	 * If this is an attribute tag it as one.
	 */
	if ((tdp->i_mode & IFMT) == IFATTRDIR) {
		ip->i_cflags |= IXATTR;
	}

	/*
	 * push inode before its name appears in a directory
	 */
	TRANS_INODE(ip->i_ufsvfs, ip);
	ufs_iupdat(ip, I_SYNC);
	rw_exit(&ip->i_contents);
	return (0);

fail:
	/* Throw away inode we just allocated. */
	ip->i_nlink = 0;
	ufs_setreclaim(ip);
	TRANS_INODE(ip->i_ufsvfs, ip);
	ip->i_flag |= ICHG;
	ip->i_seq++;
	ITIMES_NOLOCK(ip);
	rw_exit(&ip->i_contents);
	return (err);
}

/*
 * Write a prototype directory into the empty inode ip, whose parent is dp.
 *
 * The caller must hold ip's i_contents and dp's i_rwlock and i_contents
 * as writer (asserted).  When attrdir is zero the parent's link count is
 * incremented to account for the new ".." entry, and decremented again on
 * the "fail" path; when attrdir is non-zero the parent's link count is
 * left alone.
 *
 * Returns 0 on success, EMLINK if dp is already at MAXLINK, or an error
 * from BMAPALLOC(), ufs_fault(), fbread(), TRANS_DIR() or ufs_fbwrite().
 */
static int
ufs_dirmakedirect(
	struct inode *ip,		/* new directory */
	struct inode *dp,		/* parent directory */
	int	attrdir,
	struct cred *cr)
{
	struct dirtemplate *dirp;
	struct fbuf *fbp;
	int err;

	ASSERT(RW_WRITE_HELD(&ip->i_contents));
	ASSERT(RW_WRITE_HELD(&dp->i_rwlock));
	ASSERT(RW_WRITE_HELD(&dp->i_contents));
	/*
	 * Allocate space for the directory we're creating.
	 */
	err = BMAPALLOC(ip, (u_offset_t)0, DIRBLKSIZ, cr);
	if (err)
		return (err);
	if (DIRBLKSIZ > dp->i_fs->fs_fsize) {
		err = ufs_fault(ITOV(dp),
"ufs_dirmakedirect: bad fs_fsize, DIRBLKSIZ: %d > dp->i_fs->fs_fsize: %d (%s)",
		    DIRBLKSIZ, dp->i_fs->fs_fsize,
		    dp->i_fs->fs_fsmnt);
		return (err);
	}
	ip->i_size = DIRBLKSIZ;
	TRANS_INODE(ip->i_ufsvfs, ip);
	ip->i_flag |= IUPD|ICHG|IATTCHG;
	ip->i_seq++;
	ITIMES_NOLOCK(ip);
	/*
	 * Update the parent (dp) link count and write out the change.
	 * This reflects the ".." entry we'll soon write.
	 */
	if (dp->i_nlink == MAXLINK)
		return (EMLINK);
	if (attrdir == 0)
		dp->i_nlink++;
	TRANS_INODE(dp->i_ufsvfs, dp);
	dp->i_flag |= ICHG;
	dp->i_seq++;
	ufs_iupdat(dp, I_SYNC);
	/*
	 * Initialize directory with "."
	 * and ".." from static template.
	 *
	 * Since the parent directory is locked, we don't have to
	 * worry about anything changing when we drop the write
	 * lock on (ip).
	 *
	 */
	err = fbread(ITOV(ip), (offset_t)0, (uint_t)ip->i_fs->fs_fsize,
	    S_READ, &fbp);

	if (err) {
		goto fail;
	}
	dirp = (struct dirtemplate *)fbp->fb_addr;
	/*
	 * Now initialize the directory we're creating
	 * with the "." and ".." entries.
	 */
	*dirp = mastertemplate;			/* structure assignment */
	dirp->dot_ino = (uint32_t)ip->i_number;
	dirp->dotdot_ino = (uint32_t)dp->i_number;

	err = TRANS_DIR(ip, 0);
	if (err) {
		fbrelse(fbp, S_OTHER);
		goto fail;
	}

	err = ufs_fbwrite(fbp, ip);
	if (err) {
		goto fail;
	}

	return (0);

fail:
	/* undo the parent link count bump made above */
	if (attrdir == 0)
		dp->i_nlink--;
	TRANS_INODE(dp->i_ufsvfs, dp);
	dp->i_flag |= ICHG;
	dp->i_seq++;
	ufs_iupdat(dp, I_SYNC);
	return (err);
}

/*
 * Delete a directory entry.  If oip is nonzero the entry is checked
 * to make sure it still reflects oip.
 *
 * If vpp is non-null, return the ptr of the (held) vnode associated with
 * the removed name.  The caller is responsible for doing the VN_RELE().
 *
 * The caller must hold dp's i_rwlock as writer (asserted).  That lock may
 * be dropped and re-acquired here if the target directory's i_rwlock
 * cannot be obtained immediately.  dp's vfs_dqrwlock (reader) and
 * i_contents (writer) are taken and released internally.
 *
 * Returns 0 on success.  Errors generated here are EINVAL, EEXIST,
 * ENOTDIR, ENOENT, EBUSY and EPERM; others are passed through from the
 * routines called.
 */
int
ufs_dirremove(
	struct inode *dp,
	char *namep,
	struct inode *oip,
	struct vnode *cdir,
	enum dr_op op,
	struct cred *cr,
	vnode_t **vpp)	/* Return (held) vnode ptr of removed file/dir */
{
	struct direct *ep, *pep, *nep;
	struct inode *ip;
	vnode_t *dvp, *vp;
	struct slot slot;
	int namlen;
	int err;
	int mode;
	ushort_t extra;

	namlen = (int)strlen(namep);
	if (namlen == 0)
		return (ufs_fault(ITOV(dp), "ufs_dirremove: namlen == 0"));
	/*
	 * return error when removing . and ..
	 */
	if (namep[0] == '.') {
		if (namlen == 1)
			return (EINVAL);
		else if (namlen == 2 && namep[1] == '.') {
			return (EEXIST);	/* SIGH should be ENOTEMPTY */
		}
	}

	ASSERT(RW_WRITE_HELD(&dp->i_rwlock));

	/*
	 * Check accessibility of directory.
	 */
retry:
	if (((dp->i_mode & IFMT) != IFDIR) &&
	    ((dp->i_mode & IFMT) != IFATTRDIR)) {
		return (ENOTDIR);
	}

	/*
	 * Execute access is required to search the directory.
	 * Access for write is interpreted as allowing
	 * deletion of files in the directory.
	 */
	if (err = ufs_iaccess(dp, IEXEC|IWRITE, cr)) {
		return (err);
	}

	ip = NULL;
	slot.fbp = NULL;
	slot.status = FOUND;	/* don't need to look for empty slot */
	rw_enter(&dp->i_ufsvfs->vfs_dqrwlock, RW_READER);
	rw_enter(&dp->i_contents, RW_WRITER);

	err = ufs_dircheckforname(dp, namep, namlen, &slot, &ip, cr, 0);
	if (err)
		goto out_novfs;
	if (ip == NULL) {
		err = ENOENT;
		goto out_novfs;
	}
	vp = ITOV(ip);
	if (oip && oip != ip) {
		err = ENOENT;
		goto out_novfs;
	}

	mode = ip->i_mode & IFMT;
	if (mode == IFDIR || mode == IFATTRDIR) {

		/*
		 * vn_vfswlock() prevents races between mount and rmdir.
		 */
		if (vn_vfswlock(vp)) {
			err = EBUSY;
			goto out_novfs;
		}
		if (vn_mountedvfs(vp) != NULL && op != DR_RENAME) {
			err = EBUSY;
			goto out;
		}
		/*
		 * If we are removing a directory, get a lock on it.
		 * Taking a writer lock prevents a parallel ufs_dirlook from
		 * incorrectly entering a negative cache vnode entry in the dnlc
		 * If the directory is empty, it will stay empty until
		 * we can remove it.
		 */
		if (!rw_tryenter(&ip->i_rwlock, RW_WRITER)) {
			/*
			 * It is possible that a thread in rename would have
			 * acquired this rwlock. To prevent a deadlock we
			 * do a rw_tryenter. If we fail to get the lock
			 * we drop all the locks we have acquired, wait
			 * for 2 ticks and reacquire the
			 * directory's (dp) i_rwlock and try again.
			 * If we don't drop dp's i_rwlock then we will panic
			 * with a "Deadlock: cycle in blocking chain"
			 * since in ufs_dircheckpath we want dp's i_rwlock.
			 * dp is guaranteed to exist since ufs_dirremove is
			 * called after a VN_HOLD(dp) has been done.
			 */
			ufs_dirremove_retry_cnt++;
			vn_vfsunlock(vp);
			if (slot.fbp)
				fbrelse(slot.fbp, S_OTHER);
			rw_exit(&dp->i_contents);
			rw_exit(&dp->i_ufsvfs->vfs_dqrwlock);
			rw_exit(&dp->i_rwlock);
			VN_RELE(vp);
			delay(2);
			rw_enter(&dp->i_rwlock, RW_WRITER);
			goto retry;
		}
	}
	rw_enter(&ip->i_contents, RW_READER);

	/*
	 * Now check the restrictions that apply on sticky directories.
	 */
	if ((err = ufs_sticky_remove_access(dp, ip, cr)) != 0) {
		rw_exit(&ip->i_contents);
		if (mode == IFDIR || mode == IFATTRDIR)
			rw_exit(&ip->i_rwlock);
		goto out;
	}

	if (op == DR_RMDIR) {
		/*
		 * For rmdir(2), some special checks are required.
		 * (a) Don't remove any alias of the parent (e.g. ".").
		 * (b) Don't remove the current directory.
		 * (c) Make sure the entry is (still) a directory.
		 * (d) Make sure the directory is empty.
		 */

		if (dp == ip || vp == cdir)
			err = EINVAL;
		else if (((ip->i_mode & IFMT) != IFDIR) &&
		    ((ip->i_mode & IFMT) != IFATTRDIR))
			err = ENOTDIR;
		else if ((ip->i_nlink > 2) ||
		    !ufs_dirempty(ip, dp->i_number, cr)) {
			err = EEXIST;	/* SIGH should be ENOTEMPTY */
		}

		if (err) {
			rw_exit(&ip->i_contents);
			if (mode == IFDIR || mode == IFATTRDIR)
				rw_exit(&ip->i_rwlock);
			goto out;
		}
	} else if (op == DR_REMOVE)  {
		/*
		 * unlink(2) requires a different check: allow only
		 * privileged users to unlink a directory.
		 */
		if (vp->v_type == VDIR &&
		    secpolicy_fs_linkdir(cr, vp->v_vfsp)) {
			err = EPERM;
			rw_exit(&ip->i_contents);
			rw_exit(&ip->i_rwlock);
			goto out;
		}
	}

	rw_exit(&ip->i_contents);

	/*
	 * Remove the cache'd entry, if any.
	 */
	dvp = ITOV(dp);
	dnlc_remove(dvp, namep);
	ep = slot.ep;
	ep->d_ino = 0;

	if (slot.cached) {
		dcanchor_t *dcap = &dp->i_danchor;

		(void) dnlc_dir_rem_entry(dcap, namep, NULL);
		if (((int)ep->d_reclen - (int)DIRSIZ(ep)) >= LDIRSIZ(1)) {
			(void) dnlc_dir_rem_space_by_handle(dcap, slot.offset);
		}
		if (slot.offset & (DIRBLKSIZ - 1)) {
			/*
			 * Collapse new free space into previous entry.
			 * Note, the previous entry has already been
			 * validated in ufs_dircheckforname().
			 */
			ASSERT(slot.size);
			pep = (struct direct *)((char *)ep - slot.size);
			if ((pep->d_ino == 0) &&
			    ((uintptr_t)pep & (DIRBLKSIZ - 1))) {
				dnlc_dir_purge(dcap);
				slot.cached = 0;
				goto nocache;
			}
			if (pep->d_ino) {
				extra = pep->d_reclen - DIRSIZ(pep);
			} else {
				extra = pep->d_reclen;
			}
			if (extra >= LDIRSIZ(1)) {
				(void) dnlc_dir_rem_space_by_handle(dcap,
				    (uint64_t)(slot.offset - slot.size));
			}
			pep->d_reclen += ep->d_reclen;
			(void) dnlc_dir_add_space(dcap, extra + ep->d_reclen,
			    (uint64_t)(slot.offset - slot.size));
			/* adjust the previous pointer in the next entry */
			nep = (struct direct *)((char *)ep + ep->d_reclen);
			if ((uintptr_t)nep & (DIRBLKSIZ - 1)) {
				/*
				 * Not a new block.
				 *
				 * Check the validity of the entry.
				 * If it's bad, then throw away the cache and
				 * continue.
				 */
				if ((nep->d_reclen == 0) ||
				    (nep->d_reclen & 0x3) ||
				    (dnlc_dir_update(dcap, nep->d_name,
				    INO_OFF_TO_H(nep->d_ino,
				    slot.offset - slot.size)) == DNOENT)) {
					dnlc_dir_purge(dcap);
					slot.cached = 0;
				}
			}
		} else {
			(void) dnlc_dir_add_space(dcap, ep->d_reclen,
			    (uint64_t)slot.offset);
		}
	} else {
		/*
		 * If the entry isn't the first in the directory, we must
		 * reclaim the space of the now empty record by adding
		 * the record size to the size of the previous entry.
		 */
		if (slot.offset & (DIRBLKSIZ - 1)) {
			/*
			 * Collapse new free space into previous entry.
			 */
			pep = (struct direct *)((char *)ep - slot.size);
			pep->d_reclen += ep->d_reclen;
		}
	}
nocache:

	err = TRANS_DIR(dp, slot.offset);
	if (err)
		fbrelse(slot.fbp, S_OTHER);
	else
		err = ufs_fbwrite(slot.fbp, dp);
	slot.fbp = NULL;

	/*
	 * If we were removing a directory, it is 'gone' now, but we cannot
	 * unlock it as a thread may be waiting for the lock in ufs_create. If
	 * we did, it could then create a file in a deleted directory.
	 */

	if (err) {
		if (mode == IFDIR || mode == IFATTRDIR)
			rw_exit(&ip->i_rwlock);
		goto out;
	}

	rw_enter(&ip->i_contents, RW_WRITER);

	dp->i_flag |= IUPD|ICHG;
	dp->i_seq++;
	ip->i_flag |= ICHG;
	ip->i_seq++;

	TRANS_INODE(dp->i_ufsvfs, dp);
	TRANS_INODE(ip->i_ufsvfs, ip);
	/*
	 * Now dispose of the inode.
	 */
	if (ip->i_nlink > 0) {
		/*
		 * This is not done for IFATTRDIR's because they don't
		 * have entries in the dnlc and the link counts are
		 * not incremented when they are created.
		 */
		if (op == DR_RMDIR && (ip->i_mode & IFMT) == IFDIR) {
			/*
			 * Decrement by 2 because we're trashing the "."
			 * entry as well as removing the entry in dp.
			 * Clear the directory entry, but there may be
			 * other hard links so don't free the inode.
			 * Decrement the dp linkcount because we're
			 * trashing the ".." entry.
			 */
			ip->i_nlink -= 2;
			dp->i_nlink--;
			ufs_setreclaim(dp);
			/*
			 * XXX need to discard negative cache entries
			 * for vp.  See comment in ufs_delete().
			 */
			dnlc_remove(vp, ".");
			dnlc_remove(vp, "..");
			/*
			 * The return value is ignored here because if
			 * the directory purge fails we don't want to
			 * stop the delete. If ufs_dirpurgedotdot fails
			 * the delete will continue with the preexisting
			 * behavior.
			 */
			(void) ufs_dirpurgedotdot(ip, dp->i_number, cr);
		} else {
			ip->i_nlink--;
		}
		ufs_setreclaim(ip);
	}
	ITIMES_NOLOCK(dp);
	ITIMES_NOLOCK(ip);

	if (!TRANS_ISTRANS(dp->i_ufsvfs))
		ufs_iupdat(dp, I_SYNC);
	if (!TRANS_ISTRANS(ip->i_ufsvfs))
		ufs_iupdat(ip, I_SYNC);

	rw_exit(&ip->i_contents);
	if (mode == IFDIR || mode == IFATTRDIR)
		rw_exit(&ip->i_rwlock);
out:
	/* the vfs write lock is only taken for directory targets */
	if (mode == IFDIR || mode == IFATTRDIR) {
		vn_vfsunlock(vp);
	}
out_novfs:
	ASSERT(RW_WRITE_HELD(&dp->i_contents));

	if (slot.fbp)
		fbrelse(slot.fbp, S_OTHER);

	rw_exit(&dp->i_contents);
	rw_exit(&dp->i_ufsvfs->vfs_dqrwlock);

	/*
	 * If no error and vpp is non-NULL, return the vnode ptr to the caller.
	 * The caller becomes responsible for the VN_RELE().  Otherwise,
	 * Release (and delete) the inode after we drop vfs_dqrwlock to
	 * avoid deadlock since ufs_delete() grabs vfs_dqrwlock as reader.
	 */
	if (ip) {
		if ((err == 0) && (vpp != NULL)) {
			*vpp = ITOV(ip);
		} else {
			VN_RELE(vp);
		}
	}

	return (err);
}

/*
 * Return buffer with contents of block "offset"
 * from the beginning of directory "ip".  If "res"
 * is non-zero, fill it in with a pointer to the
 * remaining space in the directory.
 *
 * On success *fbpp holds the buffer obtained from fbread() and 0 is
 * returned.  On failure *fbpp is set to NULL, "res" is left untouched
 * and the fbread() error is returned.
 */
int
blkatoff(
	struct inode *ip,
	off_t offset,
	char **res,
	struct fbuf **fbpp)
{
	struct fs *fs;
	struct fbuf *fbp;
	daddr_t lbn;
	uint_t bsize;
	int err;

	CPU_STATS_ADD_K(sys, ufsdirblk, 1);
	fs = ip->i_fs;
	lbn = (daddr_t)lblkno(fs, offset);
	bsize = (uint_t)blksize(fs, ip, lbn);
	/* read the whole block containing "offset" */
	err = fbread(ITOV(ip), (offset_t)(offset & fs->fs_bmask),
	    bsize, S_READ, &fbp);
	if (err) {
		*fbpp = (struct fbuf *)NULL;
		return (err);
	}
	/* "res" points at "offset" within the mapped block */
	if (res)
		*res = fbp->fb_addr + blkoff(fs, offset);
	*fbpp = fbp;
	return (0);
}

/*
 * Do consistency checking:
 *	record length must be multiple of 4
 *	entry must fit in rest of its DIRBLKSIZ block
 *	record must be large enough to contain entry
 *	name is not longer than MAXNAMLEN
 *	name must be as long as advertised, and null terminated
 * NOTE: record length must not be zero (should be checked previously).
 * This routine is called from ufs_dircheckforname() when dirchk is true
 * or when the record length is not a multiple of 4.
 * It would be nice to set the FSBAD flag in the super-block when
 * this routine fails so that a fsck is forced on next reboot,
 * but locking is a problem.
*/ static int dirmangled( struct inode *dp, struct direct *ep, int entryoffsetinblock, off_t offset) { int i; i = DIRBLKSIZ - (entryoffsetinblock & (DIRBLKSIZ - 1)); if ((ep->d_reclen & 0x3) != 0 || (int)ep->d_reclen > i || (uint_t)ep->d_reclen < DIRSIZ(ep) || ep->d_namlen > MAXNAMLEN || ep->d_ino && dirbadname(ep->d_name, (int)ep->d_namlen)) { dirbad(dp, "mangled entry", offset); return (1); } return (0); } static void dirbad(struct inode *ip, char *how, off_t offset) { cmn_err(CE_NOTE, "%s: bad dir ino %d at offset %ld: %s", ip->i_fs->fs_fsmnt, (int)ip->i_number, offset, how); } static int dirbadname(char *sp, int l) { while (l--) { /* check for nulls */ if (*sp++ == '\0') { return (1); } } return (*sp); /* check for terminating null */ } /* * Check if a directory is empty or not. */ static int ufs_dirempty( struct inode *ip, ino_t parentino, struct cred *cr) { return (ufs_dirscan(ip, parentino, cr, 0)); } /* * clear the .. directory entry. */ static int ufs_dirpurgedotdot( struct inode *ip, ino_t parentino, struct cred *cr) { return (ufs_dirscan(ip, parentino, cr, 1)); } /* * Scan the directoy. If clr_dotdot is true clear the .. * directory else check to see if the directory is empty. * * Using a struct dirtemplate here is not precisely * what we want, but better than using a struct direct. * * clr_dotdot is used as a flag to tell us if we need * to clear the dotdot entry * * N.B.: does not handle corrupted directories. 
*/ static int ufs_dirscan( struct inode *ip, ino_t parentino, struct cred *cr, int clr_dotdot) { offset_t off; struct dirtemplate dbuf; struct direct *dp = (struct direct *)&dbuf; int err, count; int empty = 1; /* Assume it's empty */ #define MINDIRSIZ (sizeof (struct dirtemplate) / 2) ASSERT(RW_LOCK_HELD(&ip->i_contents)); ASSERT(ip->i_size <= (offset_t)MAXOFF_T); for (off = 0; off < ip->i_size; off += dp->d_reclen) { err = ufs_rdwri(UIO_READ, FREAD, ip, (caddr_t)dp, (ssize_t)MINDIRSIZ, off, UIO_SYSSPACE, &count, cr); /* * Since we read MINDIRSIZ, residual must * be 0 unless we're at end of file. */ if (err || count != 0 || dp->d_reclen == 0) { empty = 0; break; } /* skip empty entries */ if (dp->d_ino == 0) continue; /* accept only "." and ".." */ if (dp->d_namlen > 2 || dp->d_name[0] != '.') { empty = 0; break; } /* * At this point d_namlen must be 1 or 2. * 1 implies ".", 2 implies ".." if second * char is also "." */ if (dp->d_namlen == 1) continue; if (dp->d_name[1] == '.' && (ino_t)dp->d_ino == parentino) { /* * If we're doing a purge we need to check for * the . and .. entries and clear the d_ino for .. * * if clr_dotdot is set ufs_dirscan does not * check for an empty directory. */ if (clr_dotdot) { /* * Have to actually zap the .. * entry in the directory, as * otherwise someone might have * dp as its cwd and try to * open .., which now points to * an unallocated inode. */ empty = ufs_dirclrdotdot(ip, parentino); break; } else { continue; } } empty = 0; break; } return (empty); } clock_t retry_backoff_delay = 1; /* delay before retrying the i_rwlock */ uint64_t dircheck_retry_cnt; /* * Check if source directory inode is in the path of the target directory. * Target is supplied locked. * * The source and target inode's should be different upon entry. 
*/
/*
 * Parameters:
 *	source_ino - inode number of the directory being moved
 *	target     - directory to start the upward walk from; the caller
 *	             holds its i_rwlock and it is never released here
 *	sdp        - a directory whose i_rwlock the caller already holds;
 *	             the walk stops if it is reached
 *	cr         - credentials passed to ufs_iget_alloced()
 *
 * Returns 0 if the walk reaches the root (or sdp) without meeting
 * source_ino, EINVAL if source_ino is target or one of its ancestors,
 * ENOTDIR if a directory on the way looks damaged, EAGAIN if an
 * ancestor's i_rwlock could not be taken safely, or the error from
 * blkatoff()/ufs_iget_alloced().
 */
int
ufs_dircheckpath(
	ino_t source_ino,
	struct inode *target,
	struct inode *sdp,
	struct cred *cr)
{
	struct fbuf *fbp;
	struct dirtemplate *dirp;
	struct inode *ip;
	struct ufsvfs *ufsvfsp;
	struct inode *tip;
	ino_t dotdotino;
	int err;

	ASSERT(target->i_ufsvfs != NULL);
	ASSERT(RW_LOCK_HELD(&target->i_rwlock));
	ASSERT(RW_LOCK_HELD(&sdp->i_rwlock));

	ip = target;
	/* The target is the source itself. */
	if (ip->i_number == source_ino) {
		err = EINVAL;
		goto out;
	}
	/* The target is the root, so there are no ancestors to inspect. */
	if (ip->i_number == UFSROOTINO) {
		err = 0;
		goto out;
	}
	/*
	 * Search back through the directory tree, using the ".." entries.
	 * Fail any attempt to move a directory into an ancestor directory.
	 */
	fbp = NULL;
	for (;;) {
		struct vfs *vfs;

		/* Map the first block of ip; dirp overlays "." and "..". */
		err = blkatoff(ip, (off_t)0, (char **)&dirp, &fbp);
		if (err)
			break;
		if (((ip->i_mode & IFMT) != IFDIR) || ip->i_nlink == 0 ||
		    ip->i_size < sizeof (struct dirtemplate)) {
			dirbad(ip, "bad size, unlinked or not dir", (off_t)0);
			err = ENOTDIR;
			break;
		}
		if (dirp->dotdot_namlen != 2 ||
		    dirp->dotdot_name[0] != '.' ||
		    dirp->dotdot_name[1] != '.') {
			dirbad(ip, "mangled .. entry", (off_t)0);
			err = ENOTDIR;		/* Sanity check */
			break;
		}
		dotdotino = (ino_t)dirp->dotdot_ino;
		/* The source is an ancestor of the target. */
		if (dotdotino == source_ino) {
			err = EINVAL;
			break;
		}
		/* Reached the root; err is still 0 from blkatoff(). */
		if (dotdotino == UFSROOTINO)
			break;
		if (fbp) {
			fbrelse(fbp, S_OTHER);
			fbp = NULL;
		}
		/* Save these before the hold on ip is dropped below. */
		vfs = ip->i_vfs;
		ufsvfsp = ip->i_ufsvfs;

		/*
		 * Only inodes acquired by this loop are unlocked and
		 * released; the caller's target is left alone.
		 */
		if (ip != target) {
			rw_exit(&ip->i_rwlock);
			VN_RELE(ITOV(ip));
		}
		/*
		 * Race to get the inode.
		 */
		rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
		if (err = ufs_iget_alloced(vfs, dotdotino, &tip, cr)) {
			rw_exit(&ufsvfsp->vfs_dqrwlock);
			/* nothing is held any more; skip the cleanup at out: */
			ip = NULL;
			break;
		}
		rw_exit(&ufsvfsp->vfs_dqrwlock);
		/*
		 * If the directory of the source inode (also a directory)
		 * is the same as this next entry up the chain, then
		 * we know the source directory itself can't be in the
		 * chain. This also prevents a panic because we already
		 * have sdp->i_rwlock locked.
		 */
		if (tip == sdp) {
			/* err is 0 here, so this exit reports success */
			VN_RELE(ITOV(tip));
			ip = NULL;
			break;
		}
		ip = tip;

		/*
		 * If someone has set the WRITE_WANTED bit in this lock and if
		 * this happens to be a sdp or tdp of another parallel rename
		 * which is executing the same code and in similar situation
		 * we end up in a 4 way deadlock. We need to make sure that
		 * the WRITE_WANTED bit is not set.
		 */
retry_lock:
		if (!rw_tryenter(&ip->i_rwlock, RW_READER)) {
			/*
			 * If the lock is held as WRITER that's fine, but if
			 * it has the WRITE_WANTED bit set we might end up in
			 * a deadlock. If WRITE_WANTED is set we return
			 * with EAGAIN else we just go back and try.
			 */
			if (RW_ISWRITER(&ip->i_rwlock) &&
			    !(RW_WRITE_HELD(&ip->i_rwlock))) {
				err = EAGAIN;
				if (fbp) {
					fbrelse(fbp, S_OTHER);
				}
				/* the lock was never obtained; drop only the hold */
				VN_RELE(ITOV(ip));
				return (err);
			} else {
				/*
				 * The lock is being write held.  We could
				 * just do a rw_enter here but there is a
				 * window between the check and now, where
				 * the status could have changed, so to
				 * avoid looping we backoff and go back to
				 * try for the lock.
				 */
				delay(retry_backoff_delay);
				dircheck_retry_cnt++;
				goto retry_lock;
			}
		}
	}
	if (fbp) {
		fbrelse(fbp, S_OTHER);
	}
out:
	/* Drop the lock and hold on the last ancestor visited, if any. */
	if (ip) {
		if (ip != target) {
			rw_exit(&ip->i_rwlock);
			VN_RELE(ITOV(ip));
		}
	}
	return (err);
}

/*
 * Check whether the directory "ip" holds nothing but "." and "..".
 *
 * Every in-use entry must be either "." with an inode number equal to
 * parentino, or "..".  Returns 1 if that holds for the whole directory
 * and 0 otherwise, including when an entry cannot be read.
 * The caller must hold ip->i_contents.
 */
int
ufs_xattrdirempty(struct inode *ip, ino_t parentino, struct cred *cr)
{
	offset_t off;
	struct dirtemplate dbuf;
	struct direct *dp = (struct direct *)&dbuf;
	int err, count;
	int empty = 1;	/* Assume it's empty */
#define MINDIRSIZ (sizeof (struct dirtemplate) / 2)

	ASSERT(RW_LOCK_HELD(&ip->i_contents));

	ASSERT(ip->i_size <= (offset_t)MAXOFF_T);
	for (off = 0; off < ip->i_size; off += dp->d_reclen) {
		err = ufs_rdwri(UIO_READ, FREAD, ip, (caddr_t)dp,
		    (ssize_t)MINDIRSIZ, off, UIO_SYSSPACE, &count, cr);
		/*
		 * Since we read MINDIRSIZ, residual must
		 * be 0 unless we're at end of file.
		 */
		if (err || count != 0 || dp->d_reclen == 0) {
			empty = 0;
			break;
		}
		/* skip empty entries */
		if (dp->d_ino == 0)
			continue;
		/*
		 * Accept "." only when its inode number equals parentino,
		 * and accept any ".." entry.  Every other in-use entry
		 * means the directory is not empty.
		 */
		if (dp->d_namlen == 1 && dp->d_name[0] == '.' &&
		    (ino_t)dp->d_ino == parentino)
			continue;
		if (dp->d_namlen == 2 && dp->d_name[0] == '.' &&
		    dp->d_name[1] == '.') {
			continue;
		}
		empty = 0;
		break;
	}
	return (empty);
}

/*
 * Allocate and initialize a new shadow inode to contain extended attributes.
 *
 * Parameters:
 *	tdp   - inode the attribute directory is created for
 *	ipp   - receives the new inode on the success path
 *	flags - when non-zero, the new inode number is stored in
 *	        tdp->i_oeftflag
 *	cr    - credentials for the access check and the inode creation
 *
 * Returns 0 on success, EROFS if tdp's vnode is read-only, or the error
 * from ufs_iaccess(), ufs_lockfs_begin() or ufs_dirmakeinode().  An
 * ENOSPC failure is retried once after draining pending deletions when
 * the file system is using transactions.
 */
int
ufs_xattrmkdir(
	struct inode *tdp,
	struct inode **ipp,
	int flags,
	struct cred *cr)
{
	struct inode *ip;
	struct vattr va;
	int err;
	int retry = 1;
	struct ufsvfs *ufsvfsp;
	struct ulockfs *ulp;
	int issync;
	int trans_size;
	int dorwlock;		/* 0 = not yet taken, */
				/* 1 = taken outside the transaction, */
				/* 2 = taken inside the transaction */

	/*
	 * Validate permission to create attribute directory
	 */
	if ((err = ufs_iaccess(tdp, IWRITE, cr)) != 0) {
		return (err);
	}

	if (vn_is_readonly(ITOV(tdp)))
		return (EROFS);

	/*
	 * No need to re-init err after again:, since it's set before
	 * the next use of it.
	 */
again:
	dorwlock = 0;
	va.va_type = VDIR;
	va.va_uid = tdp->i_uid;
	va.va_gid = tdp->i_gid;

	/*
	 * A directory passes its own permission bits on to the attribute
	 * directory.  For anything else the owner gets 0700, and group or
	 * other read permission on tdp adds 0750 or 0705 respectively.
	 */
	if ((tdp->i_mode & IFMT) == IFDIR) {
		va.va_mode = (o_mode_t)IFATTRDIR;
		va.va_mode |= tdp->i_mode & 0777;
	} else {
		va.va_mode = (o_mode_t)IFATTRDIR|0700;
		if (tdp->i_mode & 0040)
			va.va_mode |= 0750;
		if (tdp->i_mode & 0004)
			va.va_mode |= 0705;
	}
	va.va_mask = AT_TYPE|AT_MODE;

	ufsvfsp = tdp->i_ufsvfs;

	err = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_MKDIR_MASK);
	if (err)
		return (err);

	/*
	 * Acquire i_rwlock before TRANS_BEGIN_CSYNC() if this is a file.
	 * This follows the protocol for read()/write().
	 */
	if (ITOV(tdp)->v_type != VDIR) {
		rw_enter(&tdp->i_rwlock, RW_WRITER);
		dorwlock = 1;
	}

	if (ulp) {
		trans_size = (int)TOP_MKDIR_SIZE(tdp);
		TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_MKDIR, trans_size);
	}

	/*
	 * Acquire i_rwlock after TRANS_BEGIN_CSYNC() if this is a directory.
	 * This follows the protocol established by
	 * ufs_link/create/remove/rename/mkdir/rmdir/symlink.
	 */
	if (dorwlock == 0) {
		rw_enter(&tdp->i_rwlock, RW_WRITER);
		dorwlock = 2;
	}
	rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
	rw_enter(&tdp->i_contents, RW_WRITER);

	/*
	 * Suppress out of inodes messages if we will retry.
	 */
	if (retry)
		tdp->i_flag |= IQUIET;
	err = ufs_dirmakeinode(tdp, &ip, &va, DE_ATTRDIR, cr);
	tdp->i_flag &= ~IQUIET;

	if (err)
		goto fail;

	if (flags) {

		/*
		 * Now attach it to src file.
		 */

		tdp->i_oeftflag = ip->i_number;
	}

	/* Mark the new inode and its vnode as an attribute directory. */
	ip->i_cflags |= IXATTR;
	ITOV(ip)->v_flag |= V_XATTRDIR;
	TRANS_INODE(ufsvfsp, tdp);
	tdp->i_flag |= ICHG | IUPD;
	tdp->i_seq++;
	ufs_iupdat(tdp, I_SYNC);
	rw_exit(&tdp->i_contents);
	rw_exit(&ufsvfsp->vfs_dqrwlock);

	/* Update the new inode under its own locks. */
	rw_enter(&ip->i_rwlock, RW_WRITER);
	rw_enter(&ip->i_contents, RW_WRITER);
	TRANS_INODE(ufsvfsp, ip);
	ip->i_flag |= ICHG| IUPD;
	ip->i_seq++;
	ufs_iupdat(ip, I_SYNC);
	rw_exit(&ip->i_contents);
	rw_exit(&ip->i_rwlock);
	/*
	 * Release tdp->i_rwlock in the reverse of the order it was taken
	 * relative to the transaction: inside it (2) before the end,
	 * outside it (1) after.
	 */
	if (dorwlock == 2)
		rw_exit(&tdp->i_rwlock);
	if (ulp) {
		/* NOTE(review): terr is never set to anything but 0. */
		int terr = 0;

		TRANS_END_CSYNC(ufsvfsp, err, issync, TOP_MKDIR, trans_size);
		ufs_lockfs_end(ulp);
		if (err == 0)
			err = terr;
	}
	if (dorwlock == 1)
		rw_exit(&tdp->i_rwlock);
	*ipp = ip;
	return (err);

fail:
	rw_exit(&tdp->i_contents);
	rw_exit(&ufsvfsp->vfs_dqrwlock);
	if (dorwlock == 2)
		rw_exit(&tdp->i_rwlock);
	if (ulp) {
		TRANS_END_CSYNC(ufsvfsp, err, issync, TOP_MKDIR, trans_size);
		ufs_lockfs_end(ulp);
	}
	if (dorwlock == 1)
		rw_exit(&tdp->i_rwlock);
	/*
	 * NOTE(review): ip has no initializer, so this test presumably
	 * relies on ufs_dirmakeinode() storing a value through its second
	 * argument even when it fails -- confirm against that function.
	 */
	if (ip != NULL)
		VN_RELE(ITOV(ip));

	/*
	 * No inodes?  See if any are tied up in pending deletions.
	 * This has to be done outside of any of the above, because
	 * the draining operation can't be done from inside a transaction.
	 */
	if ((err == ENOSPC) && retry && TRANS_ISTRANS(ufsvfsp)) {
		ufs_delete_drain_wait(ufsvfsp, 1);
		retry = 0;
		goto again;
	}

	return (err);
}

/*
 * clear the dotdot directory entry.
 * Used by ufs_dirscan when clr_dotdot
 * flag is set and we're deleting a
 * directory.
*/
/*
 * The first directory block of "ip" is mapped, the record that follows
 * the first entry is folded into the first entry's record length, and,
 * if that record's inode number equals "parentino", its d_ino, d_namlen
 * and d_reclen are zeroed before the block is written back.
 *
 * Returns:
 *	0	the block was written, or the second entry failed its
 *		sanity check and nothing was changed
 *	-1	the first entry failed its sanity check; nothing changed
 *	other	error from blkatoff(), TRANS_DIR() or ufs_fbwrite()
 *
 * The caller must hold ip->i_rwlock as writer and ip->i_contents.
 *
 * The buffer obtained from blkatoff() is released on every path that
 * does not hand it to ufs_fbwrite(); previously both sanity-check
 * failure paths returned without releasing it.
 */
static int
ufs_dirclrdotdot(struct inode *ip, ino_t parentino)
{
	struct fbuf *fbp;
	struct direct *dotp, *dotdotp;
	int err;

	ASSERT(RW_WRITE_HELD(&ip->i_rwlock));
	ASSERT(RW_LOCK_HELD(&ip->i_contents));

	/* On failure blkatoff() hands back no buffer, so nothing to free. */
	err = blkatoff(ip, 0, NULL, &fbp);
	if (err) {
		return (err);
	}

	/*
	 * The first entry must have a plausible name length and leave room
	 * in the directory block for at least one more minimal entry.
	 */
	dotp = (struct direct *)fbp->fb_addr;
	if ((dotp->d_namlen >= (MAXNAMLEN + 1)) ||
	    ((DIRBLKSIZ - DIRSIZ(dotp)) < (sizeof (struct dirtemplate) / 2))) {
		fbrelse(fbp, S_OTHER);
		return (-1);
	}

	/*
	 * Same for the entry that follows it: the name length must be
	 * plausible and the record must fit in what is left of the block.
	 * Failing this check is not reported as an error.
	 */
	dotdotp = (struct direct *)((char *)dotp + dotp->d_reclen);
	if ((dotdotp->d_namlen >= (MAXNAMLEN + 1)) ||
	    ((DIRBLKSIZ - DIRSIZ(dotp)) < dotdotp->d_reclen)) {
		fbrelse(fbp, S_OTHER);
		return (0);
	}

	/* Absorb the second record into the first one. */
	dotp->d_reclen += dotdotp->d_reclen;
	if (parentino == dotdotp->d_ino) {
		dotdotp->d_ino = 0;
		dotdotp->d_namlen = 0;
		dotdotp->d_reclen = 0;
	}

	/*
	 * Write the block back.  If TRANS_DIR() fails the buffer is
	 * released here; otherwise it is passed to ufs_fbwrite(), which
	 * is presumed to dispose of it.
	 */
	err = TRANS_DIR(ip, 0);
	if (err) {
		fbrelse(fbp, S_OTHER);
	} else {
		err = ufs_fbwrite(fbp, ip);
	}
	return (err);
}