fs/ufs/ufs_dir.c

/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved  	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */


#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * Directory manipulation routines.
 *
 * When manipulating directories, the i_rwlock provides serialization
 * since directories cannot be mmapped. The i_contents lock is redundant.
 */

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/signal.h>
#include <sys/cred.h>
#include <sys/proc.h>
#include <sys/disp.h>
#include <sys/user.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/stat.h>
#include <sys/mode.h>
#include <sys/buf.h>
#include <sys/uio.h>
#include <sys/dnlc.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_fs.h>
#include <sys/mount.h>
#include <sys/fs/ufs_fsdir.h>
#include <sys/fs/ufs_trans.h>
#include <sys/fs/ufs_panic.h>
#include <sys/fs/ufs_quota.h>
#include <sys/errno.h>
#include <sys/debug.h>
#include <vm/seg.h>
#include <sys/sysmacros.h>
#include <sys/cmn_err.h>
#include <sys/cpuvar.h>
#include <sys/unistd.h>
#include <sys/policy.h>

/*
 * This is required since we're using P2ROUNDUP_TYPED on DIRBLKSIZ
 */
#if !ISP2(DIRBLKSIZ)
#error	"DIRBLKSIZ not a power of 2"
#endif

/*
 * A virgin directory.
 */
static struct dirtemplate mastertemplate = {
	0, 12, 1, ".",
	0, DIRBLKSIZ - 12, 2, ".."
};

#define	LDIRSIZ(len) \
	((sizeof (struct direct) - (MAXNAMLEN + 1)) + ((len + 1 + 3) &~ 3))
#define	MAX_DIR_NAME_LEN(len) \
	(((len) - (sizeof (struct direct) - (MAXNAMLEN + 1))) - 1)

/*
 * The dnlc directory cache allows a 64 bit handle for directory entries.
 * For ufs we squeeze both the 32 bit inumber and a 32 bit disk offset
 * into the handle. Note, a 32 bit offset allows a 4GB directory, which
 * is way beyond what could be cached in memory by the directory
 * caching routines. So we are quite safe with this limit.
 * The macros below pack and unpack the handle.
 */
#define	H_TO_INO(h) (uint32_t)((h) & UINT_MAX)
#define	H_TO_OFF(h) (off_t)((h) >> 32)
#define	INO_OFF_TO_H(ino, off) (uint64_t)(((uint64_t)(off) << 32) | (ino))

/*
 * The average size of a typical on disk directory entry is about 16 bytes
 * and so defines AV_DIRECT_SHIFT : log2(16)
 * This define is only used to approximate the number of entries
 * is a directory. This is needed for dnlc_dir_start() which will immediately
 * return an error if the value is not within its acceptable range of
 * number of files in a directory.
 */
#define	AV_DIRECT_SHIFT 4
/*
 * If the directory size (from i_size) is greater than the ufs_min_dir_cache
 * tunable then we request dnlc directory caching.
 * This has found to be profitable after 1024 file names.
 */
int ufs_min_dir_cache = 1024 << AV_DIRECT_SHIFT;

/* The time point the dnlc directory caching was disabled */
static hrtime_t ufs_dc_disable_at;
/* directory caching disable duration */
static hrtime_t ufs_dc_disable_duration = (hrtime_t)NANOSEC * 5;

#ifdef DEBUG
int dirchk = 1;
#else /* !DEBUG */
int dirchk = 0;
#endif /* DEBUG */
int ufs_negative_cache = 1;
uint64_t ufs_dirremove_retry_cnt;

static void dirbad();
static int ufs_dirrename();
static int ufs_diraddentry();
static int ufs_dirempty();
static int ufs_dirscan();
static int ufs_dirclrdotdot();
static int ufs_dirfixdotdot();
static int ufs_dirpurgedotdot();
static int dirprepareentry();
static int ufs_dirmakedirect();
static int dirbadname();
static int dirmangled();

/*
 * Look for a given name in a directory.  On successful return, *ipp
 * will point to the VN_HELD inode.
 */
int
ufs_dirlook(
	struct inode *dp,
	char *namep,
	struct inode **ipp,
	struct cred *cr,
	int skipdnlc)			/* skip the 1st level dnlc */
{
	uint64_t handle;
	struct fbuf *fbp;		/* a buffer of directory entries */
	struct direct *ep;		/* the current directory entry */
	struct vnode *vp;
	struct vnode *dvp;		/* directory vnode ptr */
	dcanchor_t *dcap;
	off_t endsearch;		/* offset to end directory search */
	off_t offset;
	off_t start_off;		/* starting offset from middle search */
	off_t last_offset;		/* last offset */
	int entryoffsetinblock;		/* offset of ep in addr's buffer */
	int numdirpasses;		/* strategy for directory search */
	int namlen;			/* length of name */
	int err;
	int doingchk;
	int i;
	int caching;
	ino_t ep_ino;			/* entry i number */
	ino_t chkino;
	ushort_t ep_reclen;		/* direct local d_reclen */

	ASSERT(*namep != '\0'); /* All callers ensure *namep is non null */

	/*
	 * Check accessibility of directory.
	 */
	if (((dp->i_mode & IFMT) != IFDIR) &&
	    ((dp->i_mode & IFMT) != IFATTRDIR))
		return (ENOTDIR);

	if (err = ufs_iaccess(dp, IEXEC, cr))
		return (err);

	/*
	 * Check the directory name lookup cache, first for individual files
	 * then for complete directories.
	 */
	dvp = ITOV(dp);
	if (!skipdnlc && (vp = dnlc_lookup(dvp, namep))) {
		/* vp is already held from dnlc_lookup */
		if (vp == DNLC_NO_VNODE) {
			VN_RELE(vp);
			return (ENOENT);
		}
		*ipp = VTOI(vp);
		return (0);
	}

	dcap = &dp->i_danchor;

	/*
	 * Grab the reader lock on the directory data before checking
	 * the dnlc to avoid a race with ufs_dirremove() & friends.
	 */
	rw_enter(&dp->i_rwlock, RW_READER);

	switch (dnlc_dir_lookup(dcap, namep, &handle)) {
	case DFOUND:
		ep_ino = (ino_t)H_TO_INO(handle);
		if (dp->i_number == ep_ino) {
			VN_HOLD(dvp);	/* want ourself, "." */
			*ipp = dp;
			rw_exit(&dp->i_rwlock);
			return (0);
		}
		if (namep[0] == '.' && namep[1] == '.' && namep[2] == 0) {
			uint64_t handle2;
			/*
			 * release the lock on the dir we are searching
			 * to avoid a deadlock when grabbing the
			 * i_contents lock in ufs_iget_alloced().
			 */
			rw_exit(&dp->i_rwlock);
			rw_enter(&dp->i_ufsvfs->vfs_dqrwlock, RW_READER);
			err = ufs_iget_alloced(dp->i_vfs, ep_ino, ipp, cr);
			rw_exit(&dp->i_ufsvfs->vfs_dqrwlock);
			/*
			 * must recheck as we dropped dp->i_rwlock
			 */
			rw_enter(&dp->i_rwlock, RW_READER);
			if (!err && (dnlc_dir_lookup(dcap, namep, &handle2)
			    == DFOUND) && (handle == handle2)) {
				dnlc_update(dvp, namep, ITOV(*ipp));
				rw_exit(&dp->i_rwlock);
				return (0);
			}
			/* check failed, read the actual directory */
			if (!err) {
				VN_RELE(ITOV(*ipp));
			}
			goto restart;
		}
		/* usual case of not "." nor ".." */
		rw_enter(&dp->i_ufsvfs->vfs_dqrwlock, RW_READER);
		err = ufs_iget_alloced(dp->i_vfs, ep_ino, ipp, cr);
		rw_exit(&dp->i_ufsvfs->vfs_dqrwlock);
		if (err) {
			rw_exit(&dp->i_rwlock);
			return (err);
		}
		dnlc_update(dvp, namep, ITOV(*ipp));
		rw_exit(&dp->i_rwlock);
		return (0);
	case DNOENT:
		if (ufs_negative_cache && (dp->i_nlink > 0)) {
			dnlc_enter(dvp, namep, DNLC_NO_VNODE);
		}
		rw_exit(&dp->i_rwlock);
		return (ENOENT);
	default:
		break;
	}
restart:

	fbp = NULL;
	doingchk = 0;
	chkino = 0;
	caching = 0;

	/*
	 * Attempt to cache any directories greater than the tunable
	 * ufs_min_cache_dir. If it fails due to memory shortage (DNOMEM),
	 * disable caching for this directory and record the system time.
	 * Any attempt after the disable time has expired will enable
	 * the caching again.
	 */
	if (dp->i_size >= ufs_min_dir_cache) {
		/*
		 * if the directory caching disable time has expired
		 * enable the caching again.
		 */
		if (dp->i_cachedir == CD_DISABLED_NOMEM &&
		    gethrtime() - ufs_dc_disable_at > ufs_dc_disable_duration) {
			ufs_dc_disable_at = 0;
			dp->i_cachedir = CD_ENABLED;
		}
		if (dp->i_cachedir == CD_ENABLED) {
			switch (dnlc_dir_start(dcap, dp->i_size >>
				AV_DIRECT_SHIFT)) {
			case DNOMEM:
				dp->i_cachedir = CD_DISABLED_NOMEM;
				ufs_dc_disable_at = gethrtime();
				break;
			case DTOOBIG:
				dp->i_cachedir = CD_DISABLED_TOOBIG;
				break;
			case DOK:
				caching = 1;
				break;
			default:
				break;
			}
		}
	}
	/*
	 * If caching we don't stop when the file has been
	 * found, but need to know later, so clear *ipp now
	 */
	*ipp = NULL;

recheck:
	if (caching) {
		offset = 0;
		entryoffsetinblock = 0;
		numdirpasses = 1;
	} else {
		/*
		 * Take care to look at dp->i_diroff only once, as it
		 * may be changing due to other threads/cpus.
		 */
		offset = dp->i_diroff;
		if (offset > dp->i_size) {
			offset = 0;
		}
		if (offset == 0) {
			entryoffsetinblock = 0;
			numdirpasses = 1;
		} else {
			start_off = offset;

			entryoffsetinblock = blkoff(dp->i_fs, offset);
			if (entryoffsetinblock != 0) {
				err = blkatoff(dp, offset, (char **)0, &fbp);
				if (err)
					goto bad;
			}
			numdirpasses = 2;
		}
	}
	endsearch = P2ROUNDUP_TYPED(dp->i_size, DIRBLKSIZ, u_offset_t);
	namlen = strlen(namep);
	last_offset = 0;

searchloop:
	while (offset < endsearch) {
		/*
		 * If offset is on a block boundary,
		 * read the next directory block.
		 * Release previous if it exists.
		 */
		if (blkoff(dp->i_fs, offset) == 0) {
			if (fbp != NULL) {
				fbrelse(fbp, S_OTHER);
			}
			err = blkatoff(dp, offset, (char **)0, &fbp);
			if (err)
				goto bad;
			entryoffsetinblock = 0;
		}

		/*
		 * If the offset to the next entry is invalid or if the
		 * next entry is a zero length record or if the record
		 * length is invalid, then skip to the next directory
		 * block.  Complete validation checks are done if the
		 * record length is invalid.
		 *
		 * Full validation checks are slow so they are disabled
		 * by default.  Complete checks can be run by patching
		 * "dirchk" to be true.
		 *
		 * We have to check the validity of entryoffsetinblock
		 * here because it can be set to i_diroff above.
		 */
		ep = (struct direct *)(fbp->fb_addr + entryoffsetinblock);
		if ((entryoffsetinblock & 0x3) || ep->d_reclen == 0 ||
		    (dirchk || (ep->d_reclen & 0x3)) &&
		    dirmangled(dp, ep, entryoffsetinblock, offset)) {
			i = DIRBLKSIZ - (entryoffsetinblock & (DIRBLKSIZ - 1));
			offset += i;
			entryoffsetinblock += i;
			if (caching) {
				dnlc_dir_purge(dcap);
				caching = 0;
			}
			continue;
		}

		ep_reclen = ep->d_reclen;

		/*
		 * Add named entries and free space into the directory cache
		 */
		if (caching) {
			ushort_t extra;
			off_t off2;

			if (ep->d_ino == 0) {
				extra = ep_reclen;
				if (offset & (DIRBLKSIZ - 1)) {
					dnlc_dir_purge(dcap);
					dp->i_cachedir = CD_DISABLED;
					caching = 0;
				}
			} else {
				/*
				 * entries hold the previous offset except the
				 * 1st which holds the offset + 1
				 */
				if (offset & (DIRBLKSIZ - 1)) {
					off2 = last_offset;
				} else {
					off2 = offset + 1;
				}
				caching = (dnlc_dir_add_entry(dcap, ep->d_name,
				    INO_OFF_TO_H(ep->d_ino, off2)) == DOK);
				extra = ep_reclen - DIRSIZ(ep);
			}
			if (caching && (extra >= LDIRSIZ(1))) {
				caching = (dnlc_dir_add_space(dcap, extra,
				    (uint64_t)offset) == DOK);
			}
		}

		/*
		 * Check for a name match.
		 * We have the parent inode read locked with i_rwlock.
		 */
		if (ep->d_ino && ep->d_namlen == namlen &&
		    *namep == *ep->d_name &&	/* fast chk 1st chr */
		    bcmp(namep, ep->d_name, (int)ep->d_namlen) == 0) {

			/*
			 * We have to release the fbp early here to avoid
			 * a possible deadlock situation where we have the
			 * fbp and want the directory inode and someone doing
			 * a ufs_direnter_* has the directory inode and wants
			 * the fbp.  XXX - is this still needed?
			 */
			ep_ino = (ino_t)ep->d_ino;
			ASSERT(fbp != NULL);
			fbrelse(fbp, S_OTHER);
			fbp = NULL;

			/*
			 * Atomic update (read lock held)
			 */
			dp->i_diroff = offset;

			if (namlen == 2 && namep[0] == '.' && namep[1] == '.') {
				struct timeval32 omtime;

				if (caching) {
					dnlc_dir_purge(dcap);
					caching = 0;
				}
				if (doingchk) {
					/*
					 * if the inumber didn't change
					 * continue with already found inode.
					 */
					if (ep_ino == chkino)
						goto checkok;
					else {
						VN_RELE(ITOV(*ipp));
						/* *ipp is nulled at restart */
						goto restart;
					}
				}
				/*
				 * release the lock on the dir we are searching
				 * to avoid a deadlock when grabbing the
				 * i_contents lock in ufs_iget_alloced().
				 */
				omtime = dp->i_mtime;
				rw_exit(&dp->i_rwlock);
				rw_enter(&dp->i_ufsvfs->vfs_dqrwlock,
						RW_READER);
				err = ufs_iget_alloced(dp->i_vfs, ep_ino, ipp,
				    cr);
				rw_exit(&dp->i_ufsvfs->vfs_dqrwlock);
				rw_enter(&dp->i_rwlock, RW_READER);
				if (err)
					goto bad;
				/*
				 * Since we released the lock on the directory,
				 * we must check that the same inode is still
				 * the ".." entry for this directory.
				 */
				/*CSTYLED*/
				if (timercmp(&omtime, &dp->i_mtime, !=)) {
					/*
					 * Modification time changed on the
					 * directory, we must go check if
					 * the inumber changed for ".."
					 */
					doingchk = 1;
					chkino = ep_ino;
					entryoffsetinblock = 0;
					if (caching) {
						/*
						 * Forget directory caching
						 * for this rare case
						 */
						dnlc_dir_purge(dcap);
						caching = 0;
					}
					goto recheck;
				}
			} else if (dp->i_number == ep_ino) {
				VN_HOLD(dvp);	/* want ourself, "." */
				*ipp = dp;
				if (caching) {
					dnlc_dir_purge(dcap);
					caching = 0;
				}
			} else {
				rw_enter(&dp->i_ufsvfs->vfs_dqrwlock,
						RW_READER);
				err = ufs_iget_alloced(dp->i_vfs, ep_ino, ipp,
				    cr);
				rw_exit(&dp->i_ufsvfs->vfs_dqrwlock);
				if (err)
					goto bad;
			}
checkok:
			ASSERT(*ipp);
			dnlc_update(dvp, namep, ITOV(*ipp));
			/*
			 * If we are not caching then just return the entry
			 * otherwise complete loading up the cache
			 */
			if (!caching) {
				rw_exit(&dp->i_rwlock);
				return (0);
			}
			err = blkatoff(dp, offset, (char **)0, &fbp);
			if (err)
				goto bad;
		}
		last_offset = offset;
		offset += ep_reclen;
		entryoffsetinblock += ep_reclen;
	}
	/*
	 * If we started in the middle of the directory and failed
	 * to find our target, we must check the beginning as well.
	 */
	if (numdirpasses == 2) {
		numdirpasses--;
		offset = 0;
		endsearch = start_off;
		goto searchloop;
	}

	/*
	 * If whole directory caching is on (or was originally on) then
	 * the entry may have been found.
	 */
	if (*ipp == NULL) {
		err = ENOENT;
		if (ufs_negative_cache && (dp->i_nlink > 0)) {
			dnlc_enter(dvp, namep, DNLC_NO_VNODE);
		}
	}
	if (caching) {
		dnlc_dir_complete(dcap);
		caching = 0;
	}

bad:
	if (err && *ipp) {
		/*
		 * err and *ipp can both be set if we were attempting to
		 * cache the directory, and we found the entry, then later
		 * while trying to complete the directory cache encountered
		 * a error (eg reading a directory sector).
		 */
		VN_RELE(ITOV(*ipp));
		*ipp = NULL;
	}

	if (fbp)
		fbrelse(fbp, S_OTHER);
	rw_exit(&dp->i_rwlock);
	if (caching)
		dnlc_dir_purge(dcap);
	return (err);
}

/*
 * Write a new directory entry for DE_CREATE or DE_MKDIR operations.
 */
int
ufs_direnter_cm(
	struct inode *tdp,	/* target directory to make entry in */
	char *namep,		/* name of entry */
	enum de_op op,		/* entry operation */
	struct vattr *vap,	/* attributes if new inode needed */
	struct inode **ipp,	/* return entered inode here */
	struct cred *cr,	/* user credentials */
	int flags)		/* no entry exists */
{
	struct inode *tip;	/* inode of (existing) target file */
	char *s;
	struct ufs_slot slot;	/* slot info to pass around */
	int namlen;		/* length of name */
	int err;		/* error number */
	struct inode *nip;	/* new inode */
	int do_rele_nip = 0;	/* release nip */
	int noentry = flags & ~IQUIET;
	int quiet = flags & IQUIET;	/* Suppress out of inodes message */

	ASSERT(RW_WRITE_HELD(&tdp->i_rwlock));

	if (((tdp->i_mode & IFMT) == IFATTRDIR) && ((op == DE_MKDIR) ||
	    ((vap->va_type == VCHR) || (vap->va_type == VBLK) ||
	    (vap->va_type == VDOOR) || (vap->va_type == VSOCK) ||
	    (vap->va_type == VFIFO))))
		return (EINVAL);

	/* don't allow '/' characters in pathname component */
	for (s = namep, namlen = 0; *s; s++, namlen++)
		if (*s == '/')
			return (EACCES);
	ASSERT(namlen);

	/*
	 * If name is "." or ".." then if this is a create look it up
	 * and return EEXIST.
	 */
	if (namep[0] == '.' &&
	    (namlen == 1 || (namlen == 2 && namep[1] == '.'))) {
		/*
		 * ufs_dirlook will acquire the i_rwlock
		 */
		rw_exit(&tdp->i_rwlock);
		if (err = ufs_dirlook(tdp, namep, ipp, cr, 0)) {
			rw_enter(&tdp->i_rwlock, RW_WRITER);
			return (err);
		}
		rw_enter(&tdp->i_rwlock, RW_WRITER);
		return (EEXIST);
	}

	/*
	 * If target directory has not been removed, then we can consider
	 * allowing file to be created.
	 */
	if (tdp->i_nlink <= 0) {
		return (ENOENT);
	}

	/*
	 * Check accessibility of directory.
	 */
	if (((tdp->i_mode & IFMT) != IFDIR) &&
	    ((tdp->i_mode & IFMT) != IFATTRDIR)) {
		return (ENOTDIR);
	}

	/*
	 * Execute access is required to search the directory.
	 */
	if (err = ufs_iaccess(tdp, IEXEC, cr)) {
		return (err);
	}

	/*
	 * Search for the entry. Return VN_HELD tip if found.
	 */
	tip = NULL;
	slot.fbp = NULL;
	slot.status = NONE;
	rw_enter(&tdp->i_ufsvfs->vfs_dqrwlock, RW_READER);
	rw_enter(&tdp->i_contents, RW_WRITER);
	err = ufs_dircheckforname(tdp, namep, namlen, &slot, &tip, cr, noentry);
	if (err)
		goto out;
	if (tip) {
		ASSERT(!noentry);
		*ipp = tip;
		err = EEXIST;
	} else {
		/*
		 * The entry does not exist. Check write permission in
		 * directory to see if entry can be created.
		 */
		if (err = ufs_iaccess(tdp, IWRITE, cr))
			goto out;
		/*
		 * Make new inode and directory entry.
		 */
		tdp->i_flag |= quiet;
		if (err = ufs_dirmakeinode(tdp, &nip, vap, op, cr)) {
			if (nip != NULL)
				do_rele_nip = 1;
			goto out;
		}
		if (err = ufs_diraddentry(tdp, namep, op,
		    namlen, &slot, nip, NULL, cr)) {
			/*
			 * Unmake the inode we just made.
			 */
			rw_enter(&nip->i_contents, RW_WRITER);
			if (((nip->i_mode & IFMT) == IFDIR) ||
			    ((nip->i_mode & IFMT) == IFATTRDIR)) {
				tdp->i_nlink--;
				ufs_setreclaim(tdp);
				tdp->i_flag |= ICHG;
				tdp->i_seq++;
				TRANS_INODE(tdp->i_ufsvfs, tdp);
				ITIMES_NOLOCK(tdp);
			}
			nip->i_nlink = 0;
			ufs_setreclaim(nip);
			TRANS_INODE(nip->i_ufsvfs, nip);
			nip->i_flag |= ICHG;
			nip->i_seq++;
			ITIMES_NOLOCK(nip);
			rw_exit(&nip->i_contents);
			do_rele_nip = 1;
		} else {
			*ipp = nip;
		}
	}

out:
	if (slot.fbp)
		fbrelse(slot.fbp, S_OTHER);

	tdp->i_flag &= ~quiet;
	rw_exit(&tdp->i_contents);

	/*
	 * Drop vfs_dqrwlock before calling VN_RELE() on nip to
	 * avoid deadlock since ufs_delete() grabs vfs_dqrwlock as reader.
	 */
	rw_exit(&tdp->i_ufsvfs->vfs_dqrwlock);

	if (do_rele_nip) {
		VN_RELE(ITOV(nip));
	}

	return (err);
}

/*
 * Write a new directory entry for DE_LINK, DE_SYMLINK or DE_RENAME operations.
 * If tvpp is non-null, return with the pointer to the target vnode.
 */
int
ufs_direnter_lr(
	struct inode *tdp,	/* target directory to make entry in */
	char *namep,		/* name of entry */
	enum de_op op,		/* entry operation */
	struct inode *sdp,	/* source inode parent if rename */
	struct inode *sip,	/* source inode */
	struct cred *cr,	/* user credentials */
	vnode_t **tvpp)		/* Return: (held) vnode of (existing) target */
{
	struct inode *tip;	/* inode of (existing) target file */
	char *s;
	struct ufs_slot slot;	/* slot info to pass around */
	int namlen;		/* length of name */
	int err;		/* error number */

	/* don't allow '/' characters in pathname component */
	for (s = namep, namlen = 0; *s; s++, namlen++)
		if (*s == '/')
			return (EACCES);
	ASSERT(namlen);
	ASSERT(RW_WRITE_HELD(&tdp->i_rwlock));

	/*
	 * If name is "." or ".." then if this is a create look it up
	 * and return EEXIST.  Rename or link TO "." or ".." is forbidden.
	 */
	if (namep[0] == '.' &&
	    (namlen == 1 || (namlen == 2 && namep[1] == '.'))) {
		if (op == DE_RENAME) {
			return (EINVAL);	/* *SIGH* should be ENOTEMPTY */
		}
		return (EEXIST);
	}
	/*
	 * For link and rename lock the source entry and check the link count
	 * to see if it has been removed while it was unlocked.  If not, we
	 * increment the link count and force the inode to disk to make sure
	 * that it is there before any directory entry that points to it.
	 *
	 * In the case of a symbolic link, we are dealing with a new inode
	 * which does not yet have any links.  We've created it with a link
	 * count of 1, and we don't want to increment it since this will be
	 * its first link.
	 *
	 * We are about to push the inode to disk. We make sure
	 * that the inode's data blocks are flushed first so the
	 * inode and it's data blocks are always in sync.  This
	 * adds some robustness in in the event of a power failure
	 * or panic where sync fails. If we panic before the
	 * inode is updated, then the inode still refers to the
	 * old data blocks (or none for a new file). If we panic
	 * after the inode is updated, then the inode refers to
	 * the new data blocks.
	 *
	 * We do this before grabbing the i_contents lock because
	 * ufs_syncip() will want that lock. We could do the data
	 * syncing after the removal checks, but upon return from
	 * the data sync we would have to repeat the removal
	 * checks.
	 */
	if (err = TRANS_SYNCIP(sip, 0, I_DSYNC, TOP_FSYNC)) {
		return (err);
	}

	rw_enter(&sip->i_contents, RW_WRITER);
	if (sip->i_nlink <= 0) {
		rw_exit(&sip->i_contents);
		return (ENOENT);
	}
	if (sip->i_nlink == MAXLINK) {
		rw_exit(&sip->i_contents);
		return (EMLINK);
	}

	/*
	 * Sync the indirect blocks associated with the file
	 * for the same reasons as described above.  Since this
	 * call wants the i_contents lock held for it we can do
	 * this here with no extra work.
	 */
	if (err = ufs_sync_indir(sip)) {
		rw_exit(&sip->i_contents);
		return (err);
	}

	if (op != DE_SYMLINK)
		sip->i_nlink++;
	TRANS_INODE(sip->i_ufsvfs, sip);
	sip->i_flag |= ICHG;
	sip->i_seq++;
	ufs_iupdat(sip, I_SYNC);
	rw_exit(&sip->i_contents);

	/*
	 * If target directory has not been removed, then we can consider
	 * allowing file to be created.
	 */
	if (tdp->i_nlink <= 0) {
		err = ENOENT;
		goto out2;
	}
	/*
	 * Check accessibility of directory.
	 */
	if (((tdp->i_mode & IFMT) != IFDIR) &&
	    (tdp->i_mode & IFMT) != IFATTRDIR) {
		err = ENOTDIR;
		goto out2;
	}
	/*
	 * Execute access is required to search the directory.
	 */
	if (err = ufs_iaccess(tdp, IEXEC, cr)) {
		goto out2;
	}

	/*
	 * Search for the entry. Return VN_HELD tip if found.
	 */
	tip = NULL;
	slot.status = NONE;
	slot.fbp = NULL;
	rw_enter(&tdp->i_ufsvfs->vfs_dqrwlock, RW_READER);
	rw_enter(&tdp->i_contents, RW_WRITER);
	err = ufs_dircheckforname(tdp, namep, namlen, &slot, &tip, cr, 0);
	if (err)
		goto out;

	if (tip) {
		switch (op) {
		case DE_RENAME:
			err = ufs_dirrename(sdp, sip, tdp, namep,
			    tip, &slot, cr);
			break;

		case DE_LINK:
		case DE_SYMLINK:
			/*
			 * Can't link to an existing file.
			 */
			err = EEXIST;
			break;
		default:
			break;
		}
	} else {
		/*
		 * The entry does not exist. Check write permission in
		 * directory to see if entry can be created.
		 */
		if (err = ufs_iaccess(tdp, IWRITE, cr))
			goto out;
		err = ufs_diraddentry(tdp, namep, op, namlen, &slot, sip, sdp,
		    cr);
	}

out:
	if (slot.fbp)
		fbrelse(slot.fbp, S_OTHER);

	rw_exit(&tdp->i_contents);

	/*
	 * Drop vfs_dqrwlock before calling VN_RELE() on tip to
	 * avoid deadlock since ufs_delete() grabs vfs_dqrwlock as reader.
	 */
	rw_exit(&tdp->i_ufsvfs->vfs_dqrwlock);

	/*
	 * If we renamed a file over the top of an existing file,
	 * or linked a file to an existing file (or tried to),
	 * then set *tvpp to the target vnode, if tvpp is non-null
	 * otherwise, release and delete (or just release) the inode.
	 *
	 * N.B., by returning the target's vnode pointer to the caller,
	 * that caller becomes responsible for doing the VN_RELE.
	 */
	if (tip) {
		if ((err == 0) && (tvpp != NULL)) {
			*tvpp = ITOV(tip);
		} else {
			VN_RELE(ITOV(tip));
		}
	}

out2:
	if (err) {
		/*
		 * Undo bumped link count.
		 */
		if (op != DE_SYMLINK) {
			rw_enter(&sip->i_contents, RW_WRITER);
			sip->i_nlink--;
			ufs_setreclaim(sip);
			TRANS_INODE(sip->i_ufsvfs, sip);
			sip->i_flag |= ICHG;
			sip->i_seq++;
			ITIMES_NOLOCK(sip);
			rw_exit(&sip->i_contents);
		}
	}
	return (err);
}

/*
 * Check for the existence of a name in a directory (unless noentry
 * is set) , or else of an empty
 * slot in which an entry may be made.  If the requested name is found,
 * then on return *ipp points at the inode and *offp contains
 * its offset in the directory.  If the name is not found, then *ipp
 * will be NULL and *slotp will contain information about a directory slot in
 * which an entry may be made (either an empty slot, or the first position
 * past the end of the directory).
 * The target directory inode (tdp) is supplied write locked (i_rwlock).
 *
 * This may not be used on "." or "..", but aliases of "." are ok.
 */
int
ufs_dircheckforname(
	struct inode *tdp,	/* inode of directory being checked */
	char *namep,		/* name we're checking for */
	int namlen,		/* length of name, excluding null */
	struct ufs_slot *slotp,	/* slot structure */
	struct inode **ipp,	/* return inode if we find one */
	struct cred *cr,
	int noentry)		/* noentry - just look for space */
{
	uint64_t handle;
	struct fbuf *fbp;	/* pointer to directory block */
	struct direct *ep;	/* directory entry */
	struct direct *nep;	/* next directory entry */
	dcanchor_t *dcap;
	vnode_t *dvp;		/* directory vnode ptr */
	off_t dirsize;		/* size of the directory */
	off_t offset;		/* offset in the directory */
	off_t last_offset;	/* last offset */
	off_t enduseful;	/* pointer past last used dir slot */
	int entryoffsetinblk;	/* offset of ep in fbp's buffer */
	int i;			/* length of mangled entry */
	int needed;
	int err;
	int first;
	int caching;
	int stat;
	ino_t ep_ino;
	slotstat_t initstat = slotp->status;

	ASSERT(RW_WRITE_HELD(&tdp->i_rwlock));
	ASSERT(RW_WRITE_HELD(&tdp->i_contents));
	ASSERT(*ipp == NULL);
	fbp = NULL;

	/*
	 * First check if there is a complete cache of the directory.
	 */
	dvp = ITOV(tdp);

	dcap = &tdp->i_danchor;
	if (noentry) {
		/*
		 * We know from the 1st level dnlc cache that the entry
		 * doesn't exist, so don't bother searching the directory
		 * cache, but just look for space (possibly in the directory
		 * cache).
		 */
		stat = DNOENT;
	} else {
		stat = dnlc_dir_lookup(dcap, namep, &handle);
	}
	switch (stat) {
	case DFOUND:
		ep_ino = (ino_t)H_TO_INO(handle);
		if (tdp->i_number == ep_ino) {
			*ipp = tdp;	/* we want ourself, ie "." */
			VN_HOLD(dvp);
		} else {
			err = ufs_iget_alloced(tdp->i_vfs, ep_ino, ipp, cr);
			if (err)
				return (err);
		}
		offset = H_TO_OFF(handle);
		first = 0;
		if (offset & 1) {
			/* This is the first entry in the block */
			first = 1;
			offset -= 1;
			ASSERT((offset & (DIRBLKSIZ - 1)) == 0);
		}
		err = blkatoff(tdp, offset, (char **)&ep, &fbp);
		if (err) {
			VN_RELE(ITOV(*ipp));
			*ipp = NULL;
			return (err);
		}
		/*
		 * Check the validity of the entry.
		 * If it's bad, then throw away the cache and
		 * continue without it. The dirmangled() routine
		 * will then be called upon it.
		 */
		if ((ep->d_reclen == 0) || (ep->d_reclen & 0x3)) {
			VN_RELE(ITOV(*ipp));
			*ipp = NULL;
			dnlc_dir_purge(dcap);
			break;
		}
		/*
		 * Remember the returned offset is the offset of the
		 * preceding record (unless this is the 1st record
		 * in the DIRBLKSIZ sized block (disk sector)), then it's
		 * offset + 1. Note, no real offsets are on odd boundaries.
		 */
		if (first) {
			ASSERT((offset & (DIRBLKSIZ - 1)) == 0);
			slotp->offset = offset;
			slotp->size = 0;
			slotp->ep = ep;
		} else {
			/* get the next entry */
			nep = (struct direct *)((char *)ep + ep->d_reclen);
			/*
			 * Check the validity of this entry as well
			 * If it's bad, then throw away the cache and
			 * continue without it. The dirmangled() routine
			 * will then be called upon it.
			 */
			if ((nep->d_reclen == 0) || (nep->d_reclen & 0x3) ||
			    (nep->d_ino != ep_ino)) {
				VN_RELE(ITOV(*ipp));
				*ipp = NULL;
				dnlc_dir_purge(dcap);
				break;
			}
			slotp->offset = offset + ep->d_reclen;
			slotp->size = ep->d_reclen;
			slotp->ep = nep;
		}
		slotp->status = EXIST;
		slotp->fbp = fbp;
		slotp->endoff = 0;
		slotp->cached = 1;
		dnlc_update(dvp, namep, ITOV(*ipp));
		return (0);
	case DNOENT:
		/*
		 * The caller gets to set the initial slot status to
		 * indicate whether it's interested in getting a
		 * empty slot. For example, the status can be set
		 * to FOUND when an entry is being deleted.
		 */
		ASSERT(slotp->fbp == NULL);
		if (slotp->status == FOUND) {
			return (0);
		}
		switch (dnlc_dir_rem_space_by_len(dcap, LDIRSIZ(namlen),
		    &handle)) {
		case DFOUND:
			offset = (off_t)handle;
			err = blkatoff(tdp, offset, (char **)&ep, &fbp);
			if (err) {
				dnlc_dir_purge(dcap);
				ASSERT(*ipp == NULL);
				return (err);
			}
			/*
			 * Check the validity of the entry.
			 * If it's bad, then throw away the cache and
			 * continue without it. The dirmangled() routine
			 * will then be called upon it.
			 */
			if ((ep->d_reclen == 0) || (ep->d_reclen & 0x3)) {
				dnlc_dir_purge(dcap);
				break;
			}
			/*
			 * Remember the returned offset is the offset of the
			 * containing record.
			 */
			slotp->status = FOUND;
			slotp->ep = ep;
			slotp->offset = offset;
			slotp->fbp = fbp;
			slotp->size = ep->d_reclen;
			/*
			 * Set end offset to 0. Truncation is handled
			 * because the dnlc cache will blow away the
			 * cached directory when an entry is removed
			 * that drops the entries left to less than half
			 * the minumum number (dnlc_min_dir_cache).
			 */
			slotp->endoff = 0;
			slotp->cached = 1;
			return (0);
		case DNOENT:
			slotp->status = NONE;
			slotp->offset = P2ROUNDUP_TYPED(tdp->i_size,
			    DIRBLKSIZ, u_offset_t);
			slotp->size = DIRBLKSIZ;
			slotp->endoff = 0;
			slotp->cached = 1;
			return (0);
		default:
			break;
		}
		break;
	}
	slotp->cached = 0;
	caching = NULL;
	if (!noentry && tdp->i_size >= ufs_min_dir_cache) {
		/*
		 * if the directory caching disable time has expired
		 * enable caching again.
		 */
		if (tdp->i_cachedir == CD_DISABLED_NOMEM &&
		    gethrtime() - ufs_dc_disable_at > ufs_dc_disable_duration) {
			ufs_dc_disable_at = 0;
			tdp->i_cachedir = CD_ENABLED;
		}
		/*
		 * Attempt to cache any directories greater than the tunable
		 * ufs_min_cache_dir. If it fails due to memory shortage
		 * (DNOMEM), disable caching for this directory and record
		 * the system time. Any attempt after the disable time has
		 * expired will enable the caching again.
		 */
		if (tdp->i_cachedir == CD_ENABLED) {
			switch (dnlc_dir_start(dcap,
			    tdp->i_size >> AV_DIRECT_SHIFT)) {
			case DNOMEM:
				tdp->i_cachedir = CD_DISABLED_NOMEM;
				ufs_dc_disable_at = gethrtime();
				break;
			case DTOOBIG:
				tdp->i_cachedir = CD_DISABLED_TOOBIG;
				break;
			case DOK:
				caching = 1;
				break;
			default:
				break;
			}
		}
	}

	/*
	 * No point in using i_diroff since we must search whole directory
	 */
	dirsize = P2ROUNDUP_TYPED(tdp->i_size, DIRBLKSIZ, u_offset_t);
	enduseful = 0;
	offset = last_offset = 0;
	entryoffsetinblk = 0;
	needed = (int)LDIRSIZ(namlen);
	while (offset < dirsize) {
		/*
		 * If offset is on a block boundary,
		 * read the next directory block.
		 * Release previous if it exists.
		 */
		if (blkoff(tdp->i_fs, offset) == 0) {
			if (fbp != NULL)
				fbrelse(fbp, S_OTHER);

			err = blkatoff(tdp, offset, (char **)0, &fbp);
			if (err) {
				ASSERT(*ipp == NULL);
				if (caching) {
					dnlc_dir_purge(dcap);
				}
				return (err);
			}
			entryoffsetinblk = 0;
		}
		/*
		 * If still looking for a slot, and at a DIRBLKSIZ
		 * boundary, have to start looking for free space
		 * again.
		 */
		if (slotp->status == NONE &&
		    (entryoffsetinblk & (DIRBLKSIZ - 1)) == 0) {
			slotp->offset = -1;
		}
		/*
		 * If the next entry is a zero length record or if the
		 * record length is invalid, then skip to the next
		 * directory block.  Complete validation checks are
		 * done if the record length is invalid.
		 *
		 * Full validation checks are slow so they are disabled
		 * by default.  Complete checks can be run by patching
		 * "dirchk" to be true.
		 *
		 * We do not have to check the validity of
		 * entryoffsetinblk here because it starts out as zero
		 * and is only incremented by d_reclen values that we
		 * validate here.
		 */
		ep = (struct direct *)(fbp->fb_addr + entryoffsetinblk);
		if (ep->d_reclen == 0 ||
		    (dirchk || (ep->d_reclen & 0x3)) &&
		    dirmangled(tdp, ep, entryoffsetinblk, offset)) {
			i = DIRBLKSIZ - (entryoffsetinblk & (DIRBLKSIZ - 1));
			offset += i;
			entryoffsetinblk += i;
			if (caching) {
				dnlc_dir_purge(dcap);
				caching = 0;
			}
			continue;
		}

		/*
		 * Add named entries and free space into the directory cache
		 */
		if (caching) {
			ushort_t extra;
			off_t off2;

			if (ep->d_ino == 0) {
				extra = ep->d_reclen;
				if (offset & (DIRBLKSIZ - 1)) {
					dnlc_dir_purge(dcap);
					caching = 0;
				}
			} else {
				/*
				 * entries hold the previous offset if
				 * not the 1st one
				 */
				if (offset & (DIRBLKSIZ - 1)) {
					off2 = last_offset;
				} else {
					off2 = offset + 1;
				}
				caching = (dnlc_dir_add_entry(dcap, ep->d_name,
				    INO_OFF_TO_H(ep->d_ino, off2)) == DOK);
				extra = ep->d_reclen - DIRSIZ(ep);
			}
			if (caching && (extra >= LDIRSIZ(1))) {
				caching = (dnlc_dir_add_space(dcap, extra,
				    (uint64_t)offset) == DOK);
			}
		}

		/*
		 * If an appropriate sized slot has not yet been found,
		 * check to see if one is available.
		 */
		if ((slotp->status != FOUND) && (slotp->status != EXIST)) {
			int size = ep->d_reclen;

			if (ep->d_ino != 0)
				size -= DIRSIZ(ep);
			if (size > 0) {
				if (size >= needed) {
					slotp->offset = offset;
					slotp->size = ep->d_reclen;
					if (noentry) {
						slotp->ep = ep;
						slotp->fbp = fbp;
						slotp->status = FOUND;
						slotp->endoff = 0;
						return (0);
					}
					slotp->status = FOUND;
				} else if (slotp->status == NONE) {
					if (slotp->offset == -1)
						slotp->offset = offset;
				}
			}
		}
		/*
		 * Check for a name match.
		 */
		if (ep->d_ino && ep->d_namlen == namlen &&
		    *namep == *ep->d_name &&	/* fast chk 1st char */
		    bcmp(namep, ep->d_name, namlen) == 0) {

			tdp->i_diroff = offset;

			if (tdp->i_number == ep->d_ino) {
				*ipp = tdp;	/* we want ourself, ie "." */
				VN_HOLD(dvp);
			} else {
				err = ufs_iget_alloced(tdp->i_vfs,
				    (ino_t)ep->d_ino, ipp, cr);
				if (err) {
					fbrelse(fbp, S_OTHER);
					if (caching)
						dnlc_dir_purge(dcap);
					return (err);
				}
			}
			slotp->status = EXIST;
			slotp->offset = offset;
			slotp->size = (int)(offset - last_offset);
			slotp->fbp = fbp;
			slotp->ep = ep;
			slotp->endoff = 0;
			if (caching)
				dnlc_dir_purge(dcap);
			return (0);
		}
		last_offset = offset;
		offset += ep->d_reclen;
		entryoffsetinblk += ep->d_reclen;
		if (ep->d_ino)
			enduseful = offset;
	}
	if (fbp) {
		fbrelse(fbp, S_OTHER);
	}

	if (caching) {
		dnlc_dir_complete(dcap);
		slotp->cached = 1;
		if (slotp->status == FOUND) {
			if (initstat == FOUND) {
				return (0);
			}
			(void) dnlc_dir_rem_space_by_handle(dcap,
			    slotp->offset);
			slotp->endoff = 0;
			return (0);
		}
	}

	if (slotp->status == NONE) {
		/*
		 * We didn't find a slot; the new directory entry should be put
		 * at the end of the directory.  Return an indication of where
		 * this is, and set "endoff" to zero; since we're going to have
		 * to extend the directory, we're certainly not going to
		 * truncate it.
		 */
		slotp->offset = dirsize;
		slotp->size = DIRBLKSIZ;
		slotp->endoff = 0;
	} else {
		/*
		 * We found a slot, and will return an indication of where that
		 * slot is, as any new directory entry will be put there.
		 * Since that slot will become a useful entry, if the last
		 * useful entry we found was before this one, update the offset
		 * of the last useful entry.
		 */
		if (enduseful < slotp->offset + slotp->size)
			enduseful = slotp->offset + slotp->size;
		slotp->endoff = P2ROUNDUP_TYPED(enduseful, DIRBLKSIZ, off_t);
	}
	*ipp = NULL;
	return (0);
}

uint64_t ufs_dirrename_retry_cnt;

/*
 * Rename the entry in the directory tdp so that it points to
 * sip instead of tip.
 */
static int
ufs_dirrename(
	struct inode *sdp,	/* parent directory of source */
	struct inode *sip,	/* source inode */
	struct inode *tdp,	/* parent directory of target */
	char *namep,		/* entry we are trying to change */
	struct inode *tip,	/* target inode */
	struct ufs_slot *slotp,	/* slot for entry */
	struct cred *cr)	/* credentials */
{
	vnode_t *tdvp;
	off_t offset;
	int err;
	int doingdirectory;

	ASSERT(sdp->i_ufsvfs != NULL);
	ASSERT(RW_WRITE_HELD(&tdp->i_rwlock));
	ASSERT(RW_WRITE_HELD(&tdp->i_contents));
	/*
	 * Short circuit rename of something to itself.
	 */
	if (sip->i_number == tip->i_number) {
		return (ESAME); /* special KLUDGE error code */
	}

	/*
	 * We're locking 2 peer level locks, so must use tryenter
	 * on the 2nd to avoid deadlocks that would occur
	 * if we renamed a->b and b->a concurrently.
	 */
retry:
	rw_enter(&tip->i_contents, RW_WRITER);
	if (!rw_tryenter(&sip->i_contents, RW_READER)) {
		/*
		 * drop tip and wait (sleep) until we stand a chance
		 * of holding sip
		 */
		rw_exit(&tip->i_contents);
		rw_enter(&sip->i_contents, RW_READER);
		/*
		 * Reverse the lock grabs in case we have heavy
		 * contention on the 2nd lock.
		 */
		if (!rw_tryenter(&tip->i_contents, RW_WRITER)) {
			ufs_dirrename_retry_cnt++;
			rw_exit(&sip->i_contents);
			goto retry;
		}
	}

	/*
	 * Check that everything is on the same filesystem.
	 */
	if ((ITOV(tip)->v_vfsp != ITOV(tdp)->v_vfsp) ||
	    (ITOV(tip)->v_vfsp != ITOV(sip)->v_vfsp)) {
		err = EXDEV;		/* XXX archaic */
		goto out;
	}
	/*
	 * Must have write permission to rewrite target entry.
	 * Perform additional checks for sticky directories.
	 */
	if ((err = ufs_iaccess(tdp, IWRITE, cr)) != 0 ||
	    (err = ufs_sticky_remove_access(tdp, tip, cr)) != 0)
		goto out;

	/*
	 * Ensure source and target are compatible (both directories
	 * or both not directories).  If target is a directory it must
	 * be empty and have no links to it; in addition it must not
	 * be a mount point, and both the source and target must be
	 * writable.
	 */
	doingdirectory = (((sip->i_mode & IFMT) == IFDIR) ||
	    ((sip->i_mode & IFMT) == IFATTRDIR));
	if (((tip->i_mode & IFMT) == IFDIR) ||
	    ((tip->i_mode & IFMT) == IFATTRDIR)) {
		if (!doingdirectory) {
			err = EISDIR;
			goto out;
		}
		/*
		 * vn_vfsrlock will prevent mounts from using the directory
		 * until we are done.
		 */
		if (vn_vfsrlock(ITOV(tip))) {
			err = EBUSY;
			goto out;
		}
		if (vn_mountedvfs(ITOV(tip)) != NULL) {
			vn_vfsunlock(ITOV(tip));
			err = EBUSY;
			goto out;
		}
		if (!ufs_dirempty(tip, tdp->i_number, cr) || tip->i_nlink > 2) {
			vn_vfsunlock(ITOV(tip));
			err = EEXIST;	/* SIGH should be ENOTEMPTY */
			goto out;
		}
	} else if (doingdirectory) {
		err = ENOTDIR;
		goto out;
	}

	/*
	 * Rewrite the inode pointer for target name entry
	 * from the target inode (ip) to the source inode (sip).
	 * This prevents the target entry from disappearing
	 * during a crash. Mark the directory inode to reflect the changes.
	 */
	tdvp = ITOV(tdp);
	slotp->ep->d_ino = (int32_t)sip->i_number;
	dnlc_update(tdvp, namep, ITOV(sip));
	if (slotp->size) {
		offset = slotp->offset - slotp->size;
	} else {
		offset = slotp->offset + 1;
	}
	if (slotp->cached) {
		(void) dnlc_dir_update(&tdp->i_danchor, namep,
		    INO_OFF_TO_H(slotp->ep->d_ino, offset));
	}

	err = TRANS_DIR(tdp, slotp->offset);
	if (err)
		fbrelse(slotp->fbp, S_OTHER);
	else
		err = ufs_fbwrite(slotp->fbp, tdp);

	slotp->fbp = NULL;
	if (err) {
		if (doingdirectory)
			vn_vfsunlock(ITOV(tip));
		goto out;
	}

	TRANS_INODE(tdp->i_ufsvfs, tdp);
	tdp->i_flag |= IUPD|ICHG;
	tdp->i_seq++;
	ITIMES_NOLOCK(tdp);

	/*
	 * Decrement the link count of the target inode.
	 * Fix the ".." entry in sip to point to dp.
	 * This is done after the new entry is on the disk.
	 */
	tip->i_nlink--;
	TRANS_INODE(tip->i_ufsvfs, tip);
	tip->i_flag |= ICHG;
	tip->i_seq++;
	ITIMES_NOLOCK(tip);
	if (doingdirectory) {
		/*
		 * The entry for tip no longer exists so I can unlock the
		 * vfslock.
		 */
		vn_vfsunlock(ITOV(tip));
		/*
		 * Decrement target link count once more if it was a directory.
		 */
		if (--tip->i_nlink != 0) {
			err = ufs_fault(ITOV(tip),
		    "ufs_dirrename: target directory link count != 0 (%s)",
			    tip->i_fs->fs_fsmnt);
			rw_exit(&tip->i_contents);
			return (err);
		}
		TRANS_INODE(tip->i_ufsvfs, tip);
		ufs_setreclaim(tip);
		/*
		 * Renaming a directory with the parent different
		 * requires that ".." be rewritten.  The window is
		 * still there for ".." to be inconsistent, but this
		 * is unavoidable, and a lot shorter than when it was
		 * done in a user process.  We decrement the link
		 * count in the new parent as appropriate to reflect
		 * the just-removed target.  If the parent is the
		 * same, this is appropriate since the original
		 * directory is going away.  If the new parent is
		 * different, ufs_dirfixdotdot() will bump the link count
		 * back.
		 */
		tdp->i_nlink--;
		ufs_setreclaim(tdp);
		TRANS_INODE(tdp->i_ufsvfs, tdp);
		tdp->i_flag |= ICHG;
		tdp->i_seq++;
		ITIMES_NOLOCK(tdp);
		if (sdp != tdp) {
			rw_exit(&tip->i_contents);
			rw_exit(&sip->i_contents);
			err = ufs_dirfixdotdot(sip, sdp, tdp);
			return (err);
		}
	} else
		ufs_setreclaim(tip);
out:
	rw_exit(&tip->i_contents);
	rw_exit(&sip->i_contents);
	return (err);
}

/*
 * Fix the ".." entry of the child directory so that it points
 * to the new parent directory instead of the old one.  Routine
 * assumes that dp is a directory and that all the inodes are on
 * the same file system.
 */
static int
ufs_dirfixdotdot(
	struct inode *dp,	/* child directory */
	struct inode *opdp,	/* old parent directory */
	struct inode *npdp)	/* new parent directory */
{
	struct fbuf *fbp;
	struct dirtemplate *dirp;
	vnode_t *dvp;
	int err;

	ASSERT(RW_WRITE_HELD(&npdp->i_rwlock));
	ASSERT(RW_WRITE_HELD(&npdp->i_contents));

	/*
	 * We hold the child directory's i_contents lock before calling
	 * blkatoff so that we honor correct locking protocol which is
	 * i_contents lock and then page lock. (blkatoff will call
	 * ufs_getpage where we want the page lock)
	 * We hold the child directory's i_rwlock before i_contents (as
	 * per the locking protocol) since we are modifying the ".." entry
	 * of the child directory.
	 * We hold the i_rwlock and i_contents lock until we record
	 * this directory delta to the log (via ufs_trans_dir) and have
	 * done fbrelse.
	 */
	rw_enter(&dp->i_rwlock, RW_WRITER);
	rw_enter(&dp->i_contents, RW_WRITER);
	err = blkatoff(dp, (off_t)0, (char **)&dirp, &fbp);
	if (err)
		goto bad;

	if (dp->i_nlink <= 0 ||
	    dp->i_size < sizeof (struct dirtemplate)) {
		err = ENOENT;
		goto bad;
	}

	if (dirp->dotdot_namlen != 2 ||
	    dirp->dotdot_name[0] != '.' ||
	    dirp->dotdot_name[1] != '.') {	/* Sanity check. */
		dirbad(dp, "mangled .. entry", (off_t)0);
		err = ENOTDIR;
		goto bad;
	}

	/*
	 * Increment the link count in the new parent inode and force it out.
	 */
	if (npdp->i_nlink == MAXLINK) {
		err = EMLINK;
		goto bad;
	}
	npdp->i_nlink++;
	TRANS_INODE(npdp->i_ufsvfs, npdp);
	npdp->i_flag |= ICHG;
	npdp->i_seq++;
	ufs_iupdat(npdp, I_SYNC);

	/*
	 * Rewrite the child ".." entry and force it out.
	 */
	dvp = ITOV(dp);
	dirp->dotdot_ino = (uint32_t)npdp->i_number;
	dnlc_update(dvp, "..", ITOV(npdp));
	(void) dnlc_dir_update(&dp->i_danchor, "..",
	    INO_OFF_TO_H(dirp->dotdot_ino, 0));

	err = TRANS_DIR(dp, 0);
	if (err)
		fbrelse(fbp, S_OTHER);
	else
		err = ufs_fbwrite(fbp, dp);

	fbp = NULL;
	if (err)
		goto bad;

	rw_exit(&dp->i_contents);
	rw_exit(&dp->i_rwlock);

	/*
	 * Decrement the link count of the old parent inode and force it out.
	 */
	ASSERT(opdp);
	rw_enter(&opdp->i_contents, RW_WRITER);
	ASSERT(opdp->i_nlink > 0);
	opdp->i_nlink--;
	ufs_setreclaim(opdp);
	TRANS_INODE(opdp->i_ufsvfs, opdp);
	opdp->i_flag |= ICHG;
	opdp->i_seq++;
	ufs_iupdat(opdp, I_SYNC);
	rw_exit(&opdp->i_contents);
	return (0);

bad:
	if (fbp)
		fbrelse(fbp, S_OTHER);
	rw_exit(&dp->i_contents);
	rw_exit(&dp->i_rwlock);
	return (err);
}

/*
 * Enter the file sip in the directory tdp with name namep.
 */
static int
ufs_diraddentry(
	struct inode *tdp,
	char *namep,
	enum de_op op,
	int namlen,
	struct ufs_slot *slotp,
	struct inode *sip,
	struct inode *sdp,
	struct cred *cr)
{
	struct direct *ep, *nep;
	vnode_t *tdvp;
	dcanchor_t *dcap = &tdp->i_danchor;
	off_t offset;
	int err;
	ushort_t extra;

	ASSERT(RW_WRITE_HELD(&tdp->i_rwlock));
	ASSERT(RW_WRITE_HELD(&tdp->i_contents));
	/*
	 * Prepare a new entry.  If the caller has not supplied an
	 * existing inode, make a new one.
	 */
	err = dirprepareentry(tdp, slotp, cr);
	if (err) {
		if (slotp->fbp) {
			fbrelse(slotp->fbp, S_OTHER);
			slotp->fbp = NULL;
		}
		return (err);
	}
	/*
	 * Check inode to be linked to see if it is in the
	 * same filesystem.
	 */
	if (ITOV(tdp)->v_vfsp != ITOV(sip)->v_vfsp) {
		err = EXDEV;
		goto bad;
	}

	/*
	 * If renaming a directory then fix up the ".." entry in the
	 * directory to point to the new parent.
	 */
	if ((op == DE_RENAME) && (((sip->i_mode & IFMT) == IFDIR) ||
	    ((sip->i_mode & IFMT) == IFATTRDIR)) && (sdp != tdp)) {
		err = ufs_dirfixdotdot(sip, sdp, tdp);
		if (err)
			goto bad;
	}

	/*
	 * Fill in entry data.
	 */
	ep = slotp->ep;
	ep->d_namlen = (ushort_t)namlen;
	(void) strncpy(ep->d_name, namep, (size_t)((namlen + 4) & ~3));
	ep->d_ino = (uint32_t)sip->i_number;
	tdvp = ITOV(tdp);
	dnlc_update(tdvp, namep, ITOV(sip));
	/*
	 * Note the offset supplied for any named entry is
	 * the offset of the previous one, unless it's the 1st.
	 * slotp->size is used to pass the length to
	 * the previous entry.
	 */
	if (slotp->size) {
		offset = slotp->offset - slotp->size;
	} else {
		offset = slotp->offset + 1;
	}

	if (slotp->cached) {
		/*
		 * Add back any usable unused space to the dnlc directory
		 * cache.
		 */
		extra = ep->d_reclen - DIRSIZ(ep);
		if (extra >= LDIRSIZ(1)) {
			(void) dnlc_dir_add_space(dcap, extra,
			    (uint64_t)slotp->offset);
		}

		(void) dnlc_dir_add_entry(dcap, namep,
		    INO_OFF_TO_H(ep->d_ino, offset));

		/* adjust the previous offset of the next entry */
		nep = (struct direct *)((char *)ep + ep->d_reclen);
		if ((uintptr_t)nep & (DIRBLKSIZ - 1)) {
			/*
			 * Not a new block.
			 *
			 * Check the validity of the next entry.
			 * If it's bad, then throw away the cache, and
			 * continue as before directory caching.
			 */
			if ((nep->d_reclen == 0) || (nep->d_reclen & 0x3) ||
			    dnlc_dir_update(dcap, nep->d_name,
			    INO_OFF_TO_H(nep->d_ino, slotp->offset))
			    == DNOENT) {
				dnlc_dir_purge(dcap);
				slotp->cached = 0;
			}
		}
	}

	/*
	 * Write out the directory block.
	 */
	err = TRANS_DIR(tdp, slotp->offset);
	if (err)
		fbrelse(slotp->fbp, S_OTHER);
	else
		err = ufs_fbwrite(slotp->fbp, tdp);

	slotp->fbp = NULL;
	/*
	 * If this is a rename of a directory, then we have already
	 * fixed the ".." entry to refer to the new parent. If err
	 * is true at this point, we have failed to update the new
	 * parent to refer to the renamed directory.
	 * XXX - we need to unwind the ".." fix.
	 */
	if (err)
		return (err);

	/*
	 * Mark the directory inode to reflect the changes.
	 * Truncate the directory to chop off blocks of empty entries.
	 */

	TRANS_INODE(tdp->i_ufsvfs, tdp);
	tdp->i_flag |= IUPD|ICHG;
	tdp->i_seq++;
	tdp->i_diroff = 0;
	ITIMES_NOLOCK(tdp);
	/*
	 * If the directory grew then dirprepareentry() will have
	 * set IATTCHG in tdp->i_flag, then the directory inode must
	 * be flushed out. This is because if fsync() is used later
	 * the directory size must be correct, otherwise a crash would
	 * cause fsck to move the file to lost+found. Also because later
	 * a file may be linked in more than one directory, then there
	 * is no way to flush the original directory. So it must be
	 * flushed out on creation. See bug 4293809.
	 */
	if (tdp->i_flag & IATTCHG) {
		ufs_iupdat(tdp, I_SYNC);
	}

	if (slotp->endoff && (slotp->endoff < tdp->i_size)) {
		if (!TRANS_ISTRANS(tdp->i_ufsvfs)) {
			(void) ufs_itrunc(tdp, (u_offset_t)slotp->endoff, 0,
						cr);
		}
	}


	return (0);

bad:
	if (slotp->cached) {
		dnlc_dir_purge(dcap);
		fbrelse(slotp->fbp, S_OTHER);
		slotp->cached = 0;
		slotp->fbp = NULL;
		return (err);
	}

	/*
	 * Clear out entry prepared by dirprepareent.
	 */
	slotp->ep->d_ino = 0;
	slotp->ep->d_namlen = 0;

	/*
	 * Don't touch err so we don't clobber the real error that got us here.
	 */
	if (TRANS_DIR(tdp, slotp->offset))
		fbrelse(slotp->fbp, S_OTHER);
	else
		(void) ufs_fbwrite(slotp->fbp, tdp);
	slotp->fbp = NULL;
	return (err);
}

/*
 * Prepare a directory slot to receive an entry.
 */
static int
dirprepareentry(
	struct inode *dp,	/* directory we are working in */
	struct ufs_slot *slotp,	/* available slot info */
	struct cred *cr)
{
	struct direct *ep, *nep;
	off_t entryend;
	int err;
	slotstat_t status = slotp->status;
	ushort_t dsize;

	ASSERT((status == NONE) || (status == FOUND));
	ASSERT(RW_WRITE_HELD(&dp->i_rwlock));
	ASSERT(RW_WRITE_HELD(&dp->i_contents));
	/*
	 * If we didn't find a slot, then indicate that the
	 * new slot belongs at the end of the directory.
	 * If we found a slot, then the new entry can be
	 * put at slotp->offset.
	 */
	entryend = slotp->offset + slotp->size;
	if (status == NONE) {
		ASSERT((slotp->offset & (DIRBLKSIZ - 1)) == 0);
		if (DIRBLKSIZ > dp->i_fs->fs_fsize) {
			err = ufs_fault(ITOV(dp),
			    "dirprepareentry: bad fs_fsize, DIRBLKSIZ: %d"
			    " > dp->i_fs->fs_fsize: %d (%s)",
			    DIRBLKSIZ, dp->i_fs->fs_fsize, dp->i_fs->fs_fsmnt);
			return (err);
		}
		/*
		 * Allocate the new block.
		 */
		err = BMAPALLOC(dp, (u_offset_t)slotp->offset,
		    (int)(blkoff(dp->i_fs, slotp->offset) + DIRBLKSIZ), cr);
		if (err) {
			return (err);
		}
		dp->i_size = entryend;
		TRANS_INODE(dp->i_ufsvfs, dp);
		dp->i_flag |= IUPD|ICHG|IATTCHG;
		dp->i_seq++;
		ITIMES_NOLOCK(dp);
	} else if (entryend > dp->i_size) {
		/*
		 * Adjust directory size, if needed. This should never
		 * push the size past a new multiple of DIRBLKSIZ.
		 * This is an artifact of the old (4.2BSD) way of initializing
		 * directory sizes to be less than DIRBLKSIZ.
		 */
		dp->i_size = P2ROUNDUP_TYPED(entryend, DIRBLKSIZ, off_t);
		TRANS_INODE(dp->i_ufsvfs, dp);
		dp->i_flag |= IUPD|ICHG|IATTCHG;
		dp->i_seq++;
		ITIMES_NOLOCK(dp);
	}

	/*
	 * Get the block containing the space for the new directory entry.
	 */
	if (slotp->fbp == NULL) {
		err = blkatoff(dp, slotp->offset, (char **)&slotp->ep,
		    &slotp->fbp);
		if (err) {
			return (err);
		}
	}
	ep = slotp->ep;

	switch (status) {
	case NONE:
		/*
		 * No space in the directory. slotp->offset will be on a
		 * directory block boundary and we will write the new entry
		 * into a fresh block.
		 */
		ep->d_reclen = DIRBLKSIZ;
		slotp->size = 0; /* length of previous entry */
		break;
	case FOUND:
		/*
		 * An entry of the required size has been found. Use it.
		 */
		if (ep->d_ino == 0) {
			/* this is the 1st record in a block */
			slotp->size = 0; /* length of previous entry */
		} else {
			dsize = DIRSIZ(ep);
			nep = (struct direct *)((char *)ep + dsize);
			nep->d_reclen = ep->d_reclen - dsize;
			ep->d_reclen = dsize;
			slotp->ep = nep;
			slotp->offset += dsize;
			slotp->size = dsize; /* length of previous entry */
		}
		break;
	default:
		break;
	}
	return (0);
}

/*
 * Allocate and initialize a new inode that will go into directory tdp.
 * This routine is called from ufs_symlink(), as well as within this file.
 */
int
ufs_dirmakeinode(
	struct inode *tdp,
	struct inode **ipp,
	struct vattr *vap,
	enum de_op op,
	struct cred *cr)
{
	struct inode *ip;
	enum vtype type;
	int imode;			/* mode and format as in inode */
	ino_t ipref;
	int err;
	timestruc_t now;

	ASSERT(vap != NULL);
	ASSERT(op == DE_CREATE || op == DE_MKDIR || op == DE_ATTRDIR ||
		op == DE_SYMLINK);
	ASSERT((vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));
	ASSERT(RW_WRITE_HELD(&tdp->i_rwlock));
	ASSERT(RW_WRITE_HELD(&tdp->i_contents));
	/*
	 * Allocate a new inode.
	 */
	type = vap->va_type;
	if (type == VDIR) {
		ipref = dirpref(tdp);
	} else {
		ipref = tdp->i_number;
	}
	if (op == DE_ATTRDIR)
		imode = vap->va_mode;
	else
		imode = MAKEIMODE(type, vap->va_mode);
	*ipp = NULL;
	err = ufs_ialloc(tdp, ipref, imode, &ip, cr);
	if (err)
		return (err);

	/*
	 * We don't need to grab vfs_dqrwlock here because it is held
	 * in ufs_direnter_*() above us.
	 */
	ASSERT(RW_READ_HELD(&ip->i_ufsvfs->vfs_dqrwlock));
	rw_enter(&ip->i_contents, RW_WRITER);
	if (ip->i_dquot != NULL) {
		err = ufs_fault(ITOV(ip),
		    "ufs_dirmakeinode, ip->i_dquot != NULL: dquot (%s)",
				    tdp->i_fs->fs_fsmnt);
		rw_exit(&ip->i_contents);
		return (err);
	}
	*ipp = ip;
	ip->i_mode = (o_mode_t)imode;
	if (type == VBLK || type == VCHR) {
		dev_t d = vap->va_rdev;
		dev32_t dev32;

		/*
		 * Don't allow a special file to be created with a
		 * dev_t that cannot be represented by this filesystem
		 * format on disk.
		 */
		if (!cmpldev(&dev32, d)) {
			err = EOVERFLOW;
			goto fail;
		}

		ITOV(ip)->v_rdev = ip->i_rdev = d;

		if (dev32 & ~((O_MAXMAJ << L_BITSMINOR32) | O_MAXMIN)) {
			ip->i_ordev = dev32; /* can't use old format */
		} else {
			ip->i_ordev = cmpdev(d);
		}
	}
	ITOV(ip)->v_type = type;
	ufs_reset_vnode(ip->i_vnode);
	if (type == VDIR) {
		ip->i_nlink = 2; /* anticipating a call to dirmakedirect */
	} else {
		ip->i_nlink = 1;
	}

	if (op == DE_ATTRDIR) {
		ip->i_uid = vap->va_uid;
		ip->i_gid = vap->va_gid;
	} else
		ip->i_uid = crgetuid(cr);
	/*
	 * To determine the group-id of the created file:
	 *   1) If the gid is set in the attribute list (non-Sun & pre-4.0
	 *	clients are not likely to set the gid), then use it if
	 *	the process is privileged, belongs to the target group,
	 *	or the group is the same as the parent directory.
	 *   2) If the filesystem was not mounted with the Old-BSD-compatible
	 *	GRPID option, and the directory's set-gid bit is clear,
	 *	then use the process's gid.
	 *   3) Otherwise, set the group-id to the gid of the parent directory.
	 */
	if (op != DE_ATTRDIR && (vap->va_mask & AT_GID) &&
	    ((vap->va_gid == tdp->i_gid) || groupmember(vap->va_gid, cr) ||
	    secpolicy_vnode_create_gid(cr) == 0)) {
		/*
		 * XXX - is this only the case when a 4.0 NFS client, or a
		 * client derived from that code, makes a call over the wire?
		 */
		ip->i_gid = vap->va_gid;
	} else
		ip->i_gid = (tdp->i_mode & ISGID) ? tdp->i_gid : crgetgid(cr);

	/*
	 * For SunOS 5.0->5.4, the lines below read:
	 *
	 * ip->i_suid = (ip->i_uid > MAXUID) ? UID_LONG : ip->i_uid;
	 * ip->i_sgid = (ip->i_gid > MAXUID) ? GID_LONG : ip->i_gid;
	 *
	 * where MAXUID was set to 60002.  See notes on this in ufs_inode.c
	 */
	ip->i_suid = (ulong_t)ip->i_uid > (ulong_t)USHRT_MAX ?
		UID_LONG : ip->i_uid;
	ip->i_sgid = (ulong_t)ip->i_gid > (ulong_t)USHRT_MAX ?
		GID_LONG : ip->i_gid;

	/*
	 * If we're creating a directory, and the parent directory has the
	 * set-GID bit set, set it on the new directory.
	 * Otherwise, if the user is neither privileged nor a member of the
	 * file's new group, clear the file's set-GID bit.
	 */
	if ((tdp->i_mode & ISGID) && (type == VDIR))
		ip->i_mode |= ISGID;
	else {
		if ((ip->i_mode & ISGID) &&
		    secpolicy_vnode_setids_setgids(cr, ip->i_gid) != 0)
			ip->i_mode &= ~ISGID;
	}

	if (((vap->va_mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) ||
	    ((vap->va_mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) {
		err = EOVERFLOW;
		goto fail;
	}

	/*
	 * Extended attribute directories are not subject to quotas.
	 */
	if (op != DE_ATTRDIR)
		ip->i_dquot = getinoquota(ip);
	else
		ip->i_dquot = NULL;

	if (op == DE_MKDIR || op == DE_ATTRDIR) {
		err = ufs_dirmakedirect(ip, tdp, (op == DE_MKDIR) ? 0 : 1, cr);
		if (err)
			goto fail;
	}

	/*
	 * generate the shadow inode and attach it to the new object
	 */
	ASSERT((tdp->i_shadow && tdp->i_ufs_acl) ||
	    (!tdp->i_shadow && !tdp->i_ufs_acl));
	if (tdp->i_shadow && tdp->i_ufs_acl &&
	    (((tdp->i_mode & IFMT) == IFDIR) ||
	    ((tdp->i_mode & IFMT) == IFATTRDIR))) {
		err = ufs_si_inherit(ip, tdp, ip->i_mode, cr);
		if (err) {
			if (op == DE_MKDIR) {
				/*
				 * clean up parent directory
				 *
				 * tdp->i_contents already locked from
				 * ufs_direnter_*()
				 */
				tdp->i_nlink--;
				TRANS_INODE(tdp->i_ufsvfs, tdp);
				tdp->i_flag |= ICHG;
				tdp->i_seq++;
				ufs_iupdat(tdp, I_SYNC);
			}
			goto fail;
		}
	}

	/*
	 * If the passed in attributes contain atime and/or mtime
	 * settings, then use them instead of using the current
	 * high resolution time.
	 */
	if (vap->va_mask & (AT_MTIME|AT_ATIME)) {
		if (vap->va_mask & AT_ATIME) {
			ip->i_atime.tv_sec = vap->va_atime.tv_sec;
			ip->i_atime.tv_usec = vap->va_atime.tv_nsec / 1000;
			ip->i_flag &= ~IACC;
		} else
			ip->i_flag |= IACC;
		if (vap->va_mask & AT_MTIME) {
			ip->i_mtime.tv_sec = vap->va_mtime.tv_sec;
			ip->i_mtime.tv_usec = vap->va_mtime.tv_nsec / 1000;
			gethrestime(&now);
			if (now.tv_sec > TIME32_MAX) {
				/*
				 * In 2038, ctime sticks forever..
				 */
				ip->i_ctime.tv_sec = TIME32_MAX;
				ip->i_ctime.tv_usec = 0;
			} else {
				ip->i_ctime.tv_sec = now.tv_sec;
				ip->i_ctime.tv_usec = now.tv_nsec / 1000;
			}
			ip->i_flag &= ~(IUPD|ICHG);
			ip->i_flag |= IMODTIME;
		} else
			ip->i_flag |= IUPD|ICHG;
		ip->i_flag |= IMOD;
	} else
		ip->i_flag |= IACC|IUPD|ICHG;
	ip->i_seq++;

	/*
	 * If this is an attribute tag it as one.
	 */
	if ((tdp->i_mode & IFMT) == IFATTRDIR) {
		ip->i_cflags |= IXATTR;
	}

	/*
	 * push inode before it's name appears in a directory
	 */
	TRANS_INODE(ip->i_ufsvfs, ip);
	ufs_iupdat(ip, I_SYNC);
	rw_exit(&ip->i_contents);
	return (0);

fail:
	/* Throw away inode we just allocated. */
	ip->i_nlink = 0;
	ufs_setreclaim(ip);
	TRANS_INODE(ip->i_ufsvfs, ip);
	ip->i_flag |= ICHG;
	ip->i_seq++;
	ITIMES_NOLOCK(ip);
	rw_exit(&ip->i_contents);
	return (err);
}

/*
 * Write a prototype directory into the empty inode ip, whose parent is dp.
 */
static int
ufs_dirmakedirect(
	struct inode *ip,		/* new directory */
	struct inode *dp,		/* parent directory */
	int	attrdir,
	struct cred *cr)
{
	struct dirtemplate *dirp;
	struct fbuf *fbp;
	int err;

	ASSERT(RW_WRITE_HELD(&ip->i_contents));
	ASSERT(RW_WRITE_HELD(&dp->i_rwlock));
	ASSERT(RW_WRITE_HELD(&dp->i_contents));
	/*
	 * Allocate space for the directory we're creating.
	 */
	err = BMAPALLOC(ip, (u_offset_t)0, DIRBLKSIZ, cr);
	if (err)
		return (err);
	if (DIRBLKSIZ > dp->i_fs->fs_fsize) {
		err = ufs_fault(ITOV(dp),
"ufs_dirmakedirect: bad fs_fsize, DIRBLKSIZ: %d > dp->i_fs->fs_fsize: %d (%s)",
					DIRBLKSIZ, dp->i_fs->fs_fsize,
					dp->i_fs->fs_fsmnt);
		return (err);
	}
	ip->i_size = DIRBLKSIZ;
	TRANS_INODE(ip->i_ufsvfs, ip);
	ip->i_flag |= IUPD|ICHG|IATTCHG;
	ip->i_seq++;
	ITIMES_NOLOCK(ip);
	/*
	 * Update the tdp link count and write out the change.
	 * This reflects the ".." entry we'll soon write.
	 */
	if (dp->i_nlink == MAXLINK)
		return (EMLINK);
	if (attrdir == 0)
		dp->i_nlink++;
	TRANS_INODE(dp->i_ufsvfs, dp);
	dp->i_flag |= ICHG;
	dp->i_seq++;
	ufs_iupdat(dp, I_SYNC);
	/*
	 * Initialize directory with "."
	 * and ".." from static template.
	 *
	 * Since the parent directory is locked, we don't have to
	 * worry about anything changing when we drop the write
	 * lock on (ip).
	 *
	 */
	err = fbread(ITOV(ip), (offset_t)0, (uint_t)ip->i_fs->fs_fsize,
	    S_READ, &fbp);

	if (err) {
		goto fail;
	}
	dirp = (struct dirtemplate *)fbp->fb_addr;
	/*
	 * Now initialize the directory we're creating
	 * with the "." and ".." entries.
	 */
	*dirp = mastertemplate;			/* structure assignment */
	dirp->dot_ino = (uint32_t)ip->i_number;
	dirp->dotdot_ino = (uint32_t)dp->i_number;

	err = TRANS_DIR(ip, 0);
	if (err) {
		fbrelse(fbp, S_OTHER);
		goto fail;
	}

	err = ufs_fbwrite(fbp, ip);
	if (err) {
		goto fail;
	}

	return (0);

fail:
	if (attrdir == 0)
		dp->i_nlink--;
	TRANS_INODE(dp->i_ufsvfs, dp);
	dp->i_flag |= ICHG;
	dp->i_seq++;
	ufs_iupdat(dp, I_SYNC);
	return (err);
}

/*
 * Delete a directory entry.  If oip is nonzero the entry is checked
 * to make sure it still reflects oip.
 *
 * If vpp is non-null, return the ptr of the (held) vnode associated with
 * the removed name.  The caller is responsible for doing the VN_RELE().
 */
int
ufs_dirremove(
	struct inode *dp,
	char *namep,
	struct inode *oip,
	struct vnode *cdir,
	enum dr_op op,
	struct cred *cr,
	vnode_t **vpp)	/* Return (held) vnode ptr of removed file/dir */
{
	struct direct *ep, *pep, *nep;
	struct inode *ip;
	vnode_t *dvp, *vp;
	struct ufs_slot slot;
	int namlen;
	int err;
	int mode;
	ushort_t extra;

	namlen = (int)strlen(namep);
	if (namlen == 0)
		return (ufs_fault(ITOV(dp), "ufs_dirremove: namlen == 0"));
	/*
	 * return error when removing . and ..
	 */
	if (namep[0] == '.') {
		if (namlen == 1)
			return (EINVAL);
		else if (namlen == 2 && namep[1] == '.') {
			return (EEXIST);	/* SIGH should be ENOTEMPTY */
		}
	}

	ASSERT(RW_WRITE_HELD(&dp->i_rwlock));
	/*
	 * Check accessibility of directory.
	 */
retry:
	if (((dp->i_mode & IFMT) != IFDIR) &&
	    ((dp->i_mode & IFMT) != IFATTRDIR)) {
		return (ENOTDIR);
	}

	/*
	 * Execute access is required to search the directory.
	 * Access for write is interpreted as allowing
	 * deletion of files in the directory.
	 */
	if (err = ufs_iaccess(dp, IEXEC|IWRITE, cr)) {
		return (err);
	}

	ip = NULL;
	slot.fbp = NULL;
	slot.status = FOUND;	/* don't need to look for empty slot */
	rw_enter(&dp->i_ufsvfs->vfs_dqrwlock, RW_READER);
	rw_enter(&dp->i_contents, RW_WRITER);
	err = ufs_dircheckforname(dp, namep, namlen, &slot, &ip, cr, 0);
	if (err)
		goto out_novfs;
	if (ip == NULL) {
		err = ENOENT;
		goto out_novfs;
	}
	vp = ITOV(ip);
	if (oip && oip != ip) {
		err = ENOENT;
		goto out_novfs;
	}

	mode = ip->i_mode & IFMT;
	if (mode == IFDIR || mode == IFATTRDIR) {

		/*
		 * vn_vfsrlock() prevents races between mount and rmdir.
		 */
		if (vn_vfsrlock(vp)) {
			err = EBUSY;
			goto out_novfs;
		}
		if (vn_mountedvfs(vp) != NULL && op != DR_RENAME) {
			err = EBUSY;
			goto out;
		}
		/*
		 * If we are removing a directory, get a lock on it.
		 * Taking a writer lock prevents a parallel ufs_dirlook from
		 * incorrectly entering a negative cache vnode entry in the dnlc
		 * If the directory is empty, it will stay empty until
		 * we can remove it.
		 */
		if (!rw_tryenter(&ip->i_rwlock, RW_WRITER)) {
			/*
			 * It is possible that a thread in rename would have
			 * acquired this rwlock. To prevent a deadlock we
			 * do a rw_tryenter. If we fail to get the lock
			 * we drop all the locks we have acquired, wait
			 * for 2 ticks and reacquire the
			 * directory's (dp) i_rwlock and try again.
			 * If we dont drop dp's i_rwlock then we will panic
			 * with a "Deadlock: cycle in blocking chain"
			 * since in ufs_dircheckpath we want dp's i_rwlock.
			 * dp is guaranteed to exist since ufs_dirremove is
			 * called after a VN_HOLD(dp) has been done.
			 */
			ufs_dirremove_retry_cnt++;
			vn_vfsunlock(vp);
			if (slot.fbp)
				fbrelse(slot.fbp, S_OTHER);
			rw_exit(&dp->i_contents);
			rw_exit(&dp->i_ufsvfs->vfs_dqrwlock);
			rw_exit(&dp->i_rwlock);
			VN_RELE(vp);
			delay(2);
			rw_enter(&dp->i_rwlock, RW_WRITER);
			goto retry;
		}
	}
	rw_enter(&ip->i_contents, RW_READER);

	/*
	 * Now check the restrictions that apply on sticky directories.
	 */
	if ((err = ufs_sticky_remove_access(dp, ip, cr)) != 0) {
		rw_exit(&ip->i_contents);
		if (mode == IFDIR || mode == IFATTRDIR)
			rw_exit(&ip->i_rwlock);
		goto out;
	}

	if (op == DR_RMDIR) {
		/*
		 * For rmdir(2), some special checks are required.
		 * (a) Don't remove any alias of the parent (e.g. ".").
		 * (b) Don't remove the current directory.
		 * (c) Make sure the entry is (still) a directory.
		 * (d) Make sure the directory is empty.
		 */

		if (dp == ip || vp == cdir)
			err = EINVAL;
		else if (((ip->i_mode & IFMT) != IFDIR) &&
		    ((ip->i_mode & IFMT) != IFATTRDIR))
			err = ENOTDIR;
		else if ((ip->i_nlink > 2) ||
		    !ufs_dirempty(ip, dp->i_number, cr)) {
			err = EEXIST;	/* SIGH should be ENOTEMPTY */
		}

		if (err) {
			rw_exit(&ip->i_contents);
			if (mode == IFDIR || mode == IFATTRDIR)
				rw_exit(&ip->i_rwlock);
			goto out;
		}
	} else if (op == DR_REMOVE)  {
		/*
		 * unlink(2) requires a different check: allow only
		 * privileged users to unlink a directory.
		 */
		if (vp->v_type == VDIR &&
		    secpolicy_fs_linkdir(cr, vp->v_vfsp)) {
			err = EPERM;
			rw_exit(&ip->i_contents);
			rw_exit(&ip->i_rwlock);
			goto out;
		}
	}

	rw_exit(&ip->i_contents);

	/*
	 * Remove the cache'd entry, if any.
	 */
	dvp = ITOV(dp);
	dnlc_remove(dvp, namep);
	ep = slot.ep;
	ep->d_ino = 0;

	if (slot.cached) {
		dcanchor_t *dcap = &dp->i_danchor;

		(void) dnlc_dir_rem_entry(dcap, namep, NULL);
		if (((int)ep->d_reclen - (int)DIRSIZ(ep)) >= LDIRSIZ(1)) {
			(void) dnlc_dir_rem_space_by_handle(dcap, slot.offset);
		}
		if (slot.offset & (DIRBLKSIZ - 1)) {
			/*
			 * Collapse new free space into previous entry.
			 * Note, the previous entry has already been
			 * validated in ufs_dircheckforname().
			 */
			ASSERT(slot.size);
			pep = (struct direct *)((char *)ep - slot.size);
			if ((pep->d_ino == 0) &&
			    ((uintptr_t)pep & (DIRBLKSIZ - 1))) {
				dnlc_dir_purge(dcap);
				slot.cached = 0;
				goto nocache;
			}
			if (pep->d_ino) {
				extra = pep->d_reclen - DIRSIZ(pep);
			} else {
				extra = pep->d_reclen;
			}
			if (extra >= LDIRSIZ(1)) {
				(void) dnlc_dir_rem_space_by_handle(dcap,
				    (uint64_t)(slot.offset - slot.size));
			}
			pep->d_reclen += ep->d_reclen;
			(void) dnlc_dir_add_space(dcap, extra + ep->d_reclen,
				(uint64_t)(slot.offset - slot.size));
			/* adjust the previous pointer in the next entry */
			nep = (struct direct *)((char *)ep + ep->d_reclen);
			if ((uintptr_t)nep & (DIRBLKSIZ - 1)) {
				/*
				 * Not a new block.
				 *
				 * Check the validity of the entry.
				 * If it's bad, then throw away the cache and
				 * continue.
				 */
				if ((nep->d_reclen == 0) ||
				    (nep->d_reclen & 0x3) ||
				    (dnlc_dir_update(dcap, nep->d_name,
				    INO_OFF_TO_H(nep->d_ino,
				    slot.offset - slot.size)) == DNOENT)) {
					dnlc_dir_purge(dcap);
					slot.cached = 0;
				}
			}
		} else {
			(void) dnlc_dir_add_space(dcap, ep->d_reclen,
			(uint64_t)slot.offset);
		}
	} else {
		/*
		 * If the entry isn't the first in the directory, we must
		 * reclaim the space of the now empty record by adding
		 * the record size to the size of the previous entry.
		 */
		if (slot.offset & (DIRBLKSIZ - 1)) {
			/*
			 * Collapse new free space into previous entry.
			 */
			pep = (struct direct *)((char *)ep - slot.size);
			pep->d_reclen += ep->d_reclen;
		}
	}
nocache:


	err = TRANS_DIR(dp, slot.offset);
	if (err)
		fbrelse(slot.fbp, S_OTHER);
	else
		err = ufs_fbwrite(slot.fbp, dp);
	slot.fbp = NULL;

	/*
	 * If we were removing a directory, it is 'gone' now, but we cannot
	 * unlock it as a thread may be waiting for the lock in ufs_create. If
	 * we did, it could then create a file in a deleted directory.
	 */

	if (err) {
		if (mode == IFDIR || mode == IFATTRDIR)
			rw_exit(&ip->i_rwlock);
		goto out;
	}

	rw_enter(&ip->i_contents, RW_WRITER);

	dp->i_flag |= IUPD|ICHG;
	dp->i_seq++;
	ip->i_flag |= ICHG;
	ip->i_seq++;

	TRANS_INODE(dp->i_ufsvfs, dp);
	TRANS_INODE(ip->i_ufsvfs, ip);
	/*
	 * Now dispose of the inode.
	 */
	if (ip->i_nlink > 0) {
		/*
		 * This is not done for IFATTRDIR's because they don't
		 * have entries in the dnlc and the link counts are
		 * not incremented when they are created.
		 */
		if (op == DR_RMDIR && (ip->i_mode & IFMT) == IFDIR) {
			/*
			 * Decrement by 2 because we're trashing the "."
			 * entry as well as removing the entry in dp.
			 * Clear the directory entry, but there may be
			 * other hard links so don't free the inode.
			 * Decrement the dp linkcount because we're
			 * trashing the ".." entry.
			 */
			ip->i_nlink -= 2;
			dp->i_nlink--;
			ufs_setreclaim(dp);
			/*
			 * XXX need to discard negative cache entries
			 * for vp.  See comment in ufs_delete().
			 */
			dnlc_remove(vp, ".");
			dnlc_remove(vp, "..");
			/*
			 * The return value is ignored here bacause if
			 * the directory purge fails we don't want to
			 * stop the delete. If ufs_dirpurgedotdot fails
			 * the delete will continue with the preexiting
			 * behavior.
			 */
			(void) ufs_dirpurgedotdot(ip, dp->i_number, cr);
		} else {
			ip->i_nlink--;
		}
		ufs_setreclaim(ip);
	}
	ITIMES_NOLOCK(dp);
	ITIMES_NOLOCK(ip);

	if (!TRANS_ISTRANS(dp->i_ufsvfs))
		ufs_iupdat(dp, I_SYNC);
	if (!TRANS_ISTRANS(ip->i_ufsvfs))
		ufs_iupdat(ip, I_SYNC);

	rw_exit(&ip->i_contents);
	if (mode == IFDIR || mode == IFATTRDIR)
		rw_exit(&ip->i_rwlock);
out:
	if (mode == IFDIR || mode == IFATTRDIR) {
		vn_vfsunlock(vp);
	}
out_novfs:
	ASSERT(RW_WRITE_HELD(&dp->i_contents));

	if (slot.fbp)
		fbrelse(slot.fbp, S_OTHER);

	rw_exit(&dp->i_contents);
	rw_exit(&dp->i_ufsvfs->vfs_dqrwlock);

	/*
	 * If no error and vpp is non-NULL, return the vnode ptr to the caller.
	 * The caller becomes responsible for the VN_RELE().  Otherwise,
	 * Release (and delete) the inode after we drop vfs_dqrwlock to
	 * avoid deadlock since ufs_delete() grabs vfs_dqrwlock as reader.
	 */
	if (ip) {
		if ((err == 0) && (vpp != NULL)) {
			*vpp = ITOV(ip);
		} else {
			VN_RELE(vp);
		}
	}

	return (err);
}

/*
 * Return buffer with contents of block "offset"
 * from the beginning of directory "ip".  If "res"
 * is non-zero, fill it in with a pointer to the
 * remaining space in the directory.
 *
 */

int
blkatoff(
	struct inode *ip,
	off_t offset,
	char **res,
	struct fbuf **fbpp)
{
	struct fs *fs;
	struct fbuf *fbp;
	daddr_t lbn;
	uint_t bsize;
	int err;

	CPU_STATS_ADD_K(sys, ufsdirblk, 1);
	fs = ip->i_fs;
	lbn = (daddr_t)lblkno(fs, offset);
	bsize = (uint_t)blksize(fs, ip, lbn);
	err = fbread(ITOV(ip), (offset_t)(offset & fs->fs_bmask),
			bsize, S_READ, &fbp);
	if (err) {
		*fbpp = (struct fbuf *)NULL;
		return (err);
	}
	if (res)
		*res = fbp->fb_addr + blkoff(fs, offset);
	*fbpp = fbp;
	return (0);
}

/*
 * Do consistency checking:
 *	record length must be multiple of 4
 *	entry must fit in rest of its DIRBLKSIZ block
 *	record must be large enough to contain entry
 *	name is not longer than MAXNAMLEN
 *	name must be as long as advertised, and null terminated
 * NOTE: record length must not be zero (should be checked previously).
 *       This routine is only called if dirchk is true.
 *       It would be nice to set the FSBAD flag in the super-block when
 *       this routine fails so that a fsck is forced on next reboot,
 *       but locking is a problem.
 */
static int
dirmangled(
	struct inode *dp,
	struct direct *ep,
	int entryoffsetinblock,
	off_t offset)
{
	int i;

	i = DIRBLKSIZ - (entryoffsetinblock & (DIRBLKSIZ - 1));
	if ((ep->d_reclen & 0x3) != 0 || (int)ep->d_reclen > i ||
	    (uint_t)ep->d_reclen < DIRSIZ(ep) || ep->d_namlen > MAXNAMLEN ||
	    ep->d_ino && dirbadname(ep->d_name, (int)ep->d_namlen)) {
		dirbad(dp, "mangled entry", offset);
		return (1);
	}
	return (0);
}

static void
dirbad(struct inode *ip, char *how, off_t offset)
{
	cmn_err(CE_NOTE, "%s: bad dir ino %d at offset %ld: %s",
	    ip->i_fs->fs_fsmnt, (int)ip->i_number, offset, how);
}

static int
dirbadname(char *sp, int l)
{
	while (l--) {			/* check for nulls */
		if (*sp++ == '\0') {
			return (1);
		}
	}
	return (*sp);			/* check for terminating null */
}

/*
 * Check if a directory is empty or not.
 */
static int
ufs_dirempty(
	struct inode *ip,
	ino_t parentino,
	struct cred *cr)
{
	return (ufs_dirscan(ip, parentino, cr, 0));
}

/*
 * clear the .. directory entry.
 */
static int
ufs_dirpurgedotdot(
	struct inode *ip,
	ino_t parentino,
	struct cred *cr)
{
	return (ufs_dirscan(ip, parentino, cr, 1));
}

/*
 * Scan the directoy. If clr_dotdot is true clear the ..
 * directory else check to see if the directory is empty.
 *
 * Using a struct dirtemplate here is not precisely
 * what we want, but better than using a struct direct.
 *
 * clr_dotdot is used as a flag to tell us if we need
 * to clear the dotdot entry
 *
 * N.B.: does not handle corrupted directories.
 */
static int
ufs_dirscan(
	struct inode *ip,
	ino_t parentino,
	struct cred *cr,
	int clr_dotdot)
{
	offset_t off;
	struct dirtemplate dbuf;
	struct direct *dp = (struct direct *)&dbuf;
	int err, count;
	int empty = 1;	/* Assume it's empty */
#define	MINDIRSIZ (sizeof (struct dirtemplate) / 2)

	ASSERT(RW_LOCK_HELD(&ip->i_contents));

	ASSERT(ip->i_size <= (offset_t)MAXOFF_T);
	for (off = 0; off < ip->i_size; off += dp->d_reclen) {
		err = ufs_rdwri(UIO_READ, FREAD, ip, (caddr_t)dp,
		    (ssize_t)MINDIRSIZ, off, UIO_SYSSPACE, &count, cr);
		/*
		 * Since we read MINDIRSIZ, residual must
		 * be 0 unless we're at end of file.
		 */
		if (err || count != 0 || dp->d_reclen == 0) {
			empty = 0;
			break;
		}
		/* skip empty entries */
		if (dp->d_ino == 0)
			continue;
		/* accept only "." and ".." */
		if (dp->d_namlen > 2 || dp->d_name[0] != '.') {
			empty = 0;
			break;
		}
		/*
		 * At this point d_namlen must be 1 or 2.
		 * 1 implies ".", 2 implies ".." if second
		 * char is also "."
		 */
		if (dp->d_namlen == 1)
			continue;
		if (dp->d_name[1] == '.' &&
		    (ino_t)dp->d_ino == parentino) {
			/*
			 * If we're doing a purge we need to check for
			 * the . and .. entries and clear the d_ino for ..
			 *
			 * if clr_dotdot is set ufs_dirscan does not
			 * check for an empty directory.
			 */
			if (clr_dotdot) {
				/*
				 * Have to actually zap the ..
				 * entry in the directory, as
				 * otherwise someone might have
				 * dp as its cwd and try to
				 * open .., which now points to
				 * an unallocated inode.
				 */
				empty = ufs_dirclrdotdot(ip, parentino);
				break;
			} else {
				continue;
			}
		}
		empty = 0;
		break;
	}
	return (empty);
}

clock_t retry_backoff_delay = 1; /* delay before retrying the i_rwlock */
uint64_t dircheck_retry_cnt;
/*
 * Check if source directory inode is in the path of the target directory.
 * Target is supplied locked.
 *
 * The source and target inode's should be different upon entry.
 */
int
ufs_dircheckpath(
	ino_t source_ino,
	struct inode *target,
	struct inode *sdp,
	struct cred *cr)
{
	struct fbuf *fbp;
	struct dirtemplate *dirp;
	struct inode *ip;
	struct ufsvfs *ufsvfsp;
	struct inode *tip;
	ino_t dotdotino;
	int err;

	ASSERT(target->i_ufsvfs != NULL);
	ASSERT(RW_LOCK_HELD(&target->i_rwlock));
	ASSERT(RW_LOCK_HELD(&sdp->i_rwlock));

	ip = target;
	if (ip->i_number == source_ino) {
		err = EINVAL;
		goto out;
	}
	if (ip->i_number == UFSROOTINO) {
		err = 0;
		goto out;
	}
	/*
	 * Search back through the directory tree, using the ".." entries.
	 * Fail any attempt to move a directory into an ancestor directory.
	 */
	fbp = NULL;
	for (;;) {
		struct vfs	*vfs;

		err = blkatoff(ip, (off_t)0, (char **)&dirp, &fbp);
		if (err)
			break;
		if (((ip->i_mode & IFMT) != IFDIR) || ip->i_nlink == 0 ||
		    ip->i_size < sizeof (struct dirtemplate)) {
			dirbad(ip, "bad size, unlinked or not dir", (off_t)0);
			err = ENOTDIR;
			break;
		}
		if (dirp->dotdot_namlen != 2 ||
		    dirp->dotdot_name[0] != '.' ||
		    dirp->dotdot_name[1] != '.') {
			dirbad(ip, "mangled .. entry", (off_t)0);
			err = ENOTDIR;		/* Sanity check */
			break;
		}
		dotdotino = (ino_t)dirp->dotdot_ino;
		if (dotdotino == source_ino) {
			err = EINVAL;
			break;
		}
		if (dotdotino == UFSROOTINO)
			break;
		if (fbp) {
			fbrelse(fbp, S_OTHER);
			fbp = NULL;
		}
		vfs = ip->i_vfs;
		ufsvfsp = ip->i_ufsvfs;

		if (ip != target) {
			rw_exit(&ip->i_rwlock);
			VN_RELE(ITOV(ip));
		}
		/*
		 * Race to get the inode.
		 */
		rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
		if (err = ufs_iget_alloced(vfs, dotdotino, &tip, cr)) {
			rw_exit(&ufsvfsp->vfs_dqrwlock);
			ip = NULL;
			break;
		}
		rw_exit(&ufsvfsp->vfs_dqrwlock);
		/*
		 * If the directory of the source inode (also a directory)
		 * is the same as this next entry up the chain, then
		 * we know the source directory itself can't be in the
		 * chain. This also prevents a panic because we already
		 * have sdp->i_rwlock locked.
		 */
		if (tip == sdp) {
			VN_RELE(ITOV(tip));
			ip = NULL;
			break;
		}
		ip = tip;

		/*
		 * If someone has set the WRITE_WANTED bit in this lock and if
		 * this happens to be a sdp or tdp of another parallel rename
		 * which is executing  the same code and in similar situation
		 * we end up in a 4 way deadlock. We need to make sure that
		 * the WRITE_WANTED bit is not  set.
		 */
retry_lock:
		if (!rw_tryenter(&ip->i_rwlock, RW_READER)) {
			/*
			 * If the lock held as WRITER thats fine but if it
			 * has WRITE_WANTED bit set we might end up in a
			 * deadlock. If WRITE_WANTED is set we return
			 * with EAGAIN else we just go back and try.
			 */
			if (RW_ISWRITER(&ip->i_rwlock) &&
					!(RW_WRITE_HELD(&ip->i_rwlock))) {
				err = EAGAIN;
				if (fbp) {
					fbrelse(fbp, S_OTHER);
				}
				VN_RELE(ITOV(ip));
				return (err);
			} else {
				/*
				 * The lock is being write held. We could
				 * just do a rw_enter here but there is a
				 * window between the check and now, where
				 * the status could have changed, so to
				 * avoid looping we backoff and go back to
				 * try for the lock.
				 */
				delay(retry_backoff_delay);
				dircheck_retry_cnt++;
				goto retry_lock;
			}
		}
	}
	if (fbp) {
		fbrelse(fbp, S_OTHER);
	}
out:
	if (ip) {
		if (ip != target) {
			rw_exit(&ip->i_rwlock);
			VN_RELE(ITOV(ip));
		}
	}
	return (err);
}

int
ufs_xattrdirempty(struct inode *ip, ino_t parentino, struct cred *cr)
{
	offset_t off;
	struct dirtemplate dbuf;
	struct direct *dp = (struct direct *)&dbuf;
	int err, count;
	int empty = 1;	/* Assume it's empty */
#define	MINDIRSIZ (sizeof (struct dirtemplate) / 2)

	ASSERT(RW_LOCK_HELD(&ip->i_contents));

	ASSERT(ip->i_size <= (offset_t)MAXOFF_T);
	for (off = 0; off < ip->i_size; off += dp->d_reclen) {
		err = ufs_rdwri(UIO_READ, FREAD, ip, (caddr_t)dp,
		    (ssize_t)MINDIRSIZ, off, UIO_SYSSPACE, &count, cr);
		/*
		 * Since we read MINDIRSIZ, residual must
		 * be 0 unless we're at end of file.
		 */

		if (err || count != 0 || dp->d_reclen == 0) {
			empty = 0;
			break;
		}
		/* skip empty entries */
		if (dp->d_ino == 0)
			continue;
		/*
		 * At this point d_namlen must be 1 or 2.
		 * 1 implies ".", 2 implies ".." if second
		 * char is also "."
		 */

		if (dp->d_namlen == 1 && dp->d_name[0] == '.' &&
				(ino_t)dp->d_ino == parentino)
			continue;

		if (dp->d_namlen == 2 && dp->d_name[0] == '.' &&
			dp->d_name[1] == '.') {
			continue;
		}
		empty = 0;
		break;
	}
	return (empty);
}


/*
 * Allocate and initialize a new shadow inode to contain extended attributes.
 */
int
ufs_xattrmkdir(
	struct inode *tdp,
	struct inode **ipp,
	int flags,
	struct cred *cr)
{
	struct inode *ip;
	struct vattr va;
	int err;
	int retry = 1;
	struct ufsvfs *ufsvfsp;
	struct ulockfs *ulp;
	int issync;
	int trans_size;
	int dorwlock;		/* 0 = not yet taken, */
				/* 1 = taken outside the transaction, */
				/* 2 = taken inside the transaction */

	/*
	 * Validate permission to create attribute directory
	 */

	if ((err = ufs_iaccess(tdp, IWRITE, cr)) != 0) {
		return (err);
	}

	if (vn_is_readonly(ITOV(tdp)))
		return (EROFS);

	/*
	 * No need to re-init err after again:, since it's set before
	 * the next use of it.
	 */
again:
	dorwlock = 0;
	va.va_type = VDIR;
	va.va_uid = tdp->i_uid;
	va.va_gid = tdp->i_gid;

	if ((tdp->i_mode & IFMT) == IFDIR) {
		va.va_mode = (o_mode_t)IFATTRDIR;
		va.va_mode |= tdp->i_mode & 0777;
	} else {
		va.va_mode = (o_mode_t)IFATTRDIR|0700;
		if (tdp->i_mode & 0040)
			va.va_mode |= 0750;
		if (tdp->i_mode & 0004)
			va.va_mode |= 0705;
	}
	va.va_mask = AT_TYPE|AT_MODE;

	ufsvfsp = tdp->i_ufsvfs;

	err = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_MKDIR_MASK);
	if (err)
		return (err);

	/*
	 * Acquire i_rwlock before TRANS_BEGIN_CSYNC() if this is a file.
	 * This follows the protocol for read()/write().
	 */
	if (ITOV(tdp)->v_type != VDIR) {
		rw_enter(&tdp->i_rwlock, RW_WRITER);
		dorwlock = 1;
	}

	if (ulp) {
		trans_size = (int)TOP_MKDIR_SIZE(tdp);
		TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_MKDIR, trans_size);
	}

	/*
	 * Acquire i_rwlock after TRANS_BEGIN_CSYNC() if this is a directory.
	 * This follows the protocol established by
	 * ufs_link/create/remove/rename/mkdir/rmdir/symlink.
	 */
	if (dorwlock == 0) {
		rw_enter(&tdp->i_rwlock, RW_WRITER);
		dorwlock = 2;
	}
	rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
	rw_enter(&tdp->i_contents, RW_WRITER);

	/*
	 * Suppress out of inodes messages if we will retry.
	 */
	if (retry)
		tdp->i_flag |= IQUIET;
	err = ufs_dirmakeinode(tdp, &ip, &va, DE_ATTRDIR, cr);
	tdp->i_flag &= ~IQUIET;

	if (err)
		goto fail;

	if (flags) {

		/*
		 * Now attach it to src file.
		 */

		tdp->i_oeftflag = ip->i_number;
	}

	ip->i_cflags |= IXATTR;
	ITOV(ip)->v_flag |= V_XATTRDIR;
	TRANS_INODE(ufsvfsp, tdp);
	tdp->i_flag |= ICHG | IUPD;
	tdp->i_seq++;
	ufs_iupdat(tdp, I_SYNC);
	rw_exit(&tdp->i_contents);
	rw_exit(&ufsvfsp->vfs_dqrwlock);

	rw_enter(&ip->i_rwlock, RW_WRITER);
	rw_enter(&ip->i_contents, RW_WRITER);
	TRANS_INODE(ufsvfsp, ip);
	ip->i_flag |= ICHG| IUPD;
	ip->i_seq++;
	ufs_iupdat(ip, I_SYNC);
	rw_exit(&ip->i_contents);
	rw_exit(&ip->i_rwlock);
	if (dorwlock == 2)
		rw_exit(&tdp->i_rwlock);
	if (ulp) {
		int terr = 0;

		TRANS_END_CSYNC(ufsvfsp, err, issync, TOP_MKDIR, trans_size);
		ufs_lockfs_end(ulp);
		if (err == 0)
			err = terr;
	}
	if (dorwlock == 1)
		rw_exit(&tdp->i_rwlock);
	*ipp = ip;
	return (err);

fail:
	rw_exit(&tdp->i_contents);
	rw_exit(&ufsvfsp->vfs_dqrwlock);
	if (dorwlock == 2)
		rw_exit(&tdp->i_rwlock);
	if (ulp) {
		TRANS_END_CSYNC(ufsvfsp, err, issync, TOP_MKDIR, trans_size);
		ufs_lockfs_end(ulp);
	}
	if (dorwlock == 1)
		rw_exit(&tdp->i_rwlock);
	if (ip != NULL)
		VN_RELE(ITOV(ip));

	/*
	 * No inodes?  See if any are tied up in pending deletions.
	 * This has to be done outside of any of the above, because
	 * the draining operation can't be done from inside a transaction.
	 */
	if ((err == ENOSPC) && retry && TRANS_ISTRANS(ufsvfsp)) {
		ufs_delete_drain_wait(ufsvfsp, 1);
		retry = 0;
		goto again;
	}

	return (err);
}

/*
 * clear the dotdot directory entry.
 * Used by ufs_dirscan when clr_dotdot
 * flag is set and we're deleting a
 * directory.
 */
static int
ufs_dirclrdotdot(struct inode *ip, ino_t parentino)
{
	struct fbuf *fbp;
	struct direct *dotp, *dotdotp;
	int err = 0;

	ASSERT(RW_WRITE_HELD(&ip->i_rwlock));
	ASSERT(RW_LOCK_HELD(&ip->i_contents));
	err = blkatoff(ip, 0, NULL, &fbp);
	if (err) {
		return (err);
	}

	dotp = (struct direct *)fbp->fb_addr;
	if ((dotp->d_namlen < (MAXNAMLEN + 1)) &&
	    ((DIRBLKSIZ - DIRSIZ(dotp)) >= (sizeof (struct dirtemplate) / 2))) {
		dotdotp = (struct direct *)((char *)dotp + dotp->d_reclen);
		if ((dotdotp->d_namlen < (MAXNAMLEN + 1)) &&
		    ((DIRBLKSIZ - DIRSIZ(dotp)) >= dotdotp->d_reclen)) {

			dotp->d_reclen += dotdotp->d_reclen;
			if (parentino == dotdotp->d_ino) {
				dotdotp->d_ino = 0;
				dotdotp->d_namlen = 0;
				dotdotp->d_reclen = 0;
			}

			err = TRANS_DIR(ip, 0);
			if (err) {
				fbrelse(fbp, S_OTHER);
			} else {
				err = ufs_fbwrite(fbp, ip);
			}
		}
	} else {
		err = -1;
	}
	return (err);
}