/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/systm.h>
#include <sys/types.h>
#include <sys/vnode.h>
#include <sys/errno.h>
#include <sys/sysmacros.h>
#include <sys/debug.h>
#include <sys/kmem.h>
#include <sys/conf.h>
#include <sys/proc.h>
#include <sys/cmn_err.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_filio.h>
#include <sys/fs/ufs_log.h>
#include <sys/inttypes.h>
#include <sys/atomic.h>
#include <sys/tuneable.h>

/*
 * externs
 */
extern pri_t minclsyspri;
extern struct kmem_cache *lufs_bp;
extern int ufs_trans_push_quota();

/*
 * globals
 */
kmem_cache_t *mapentry_cache;

/*
 * logmap tuning constants
 *
 * logmap_maxnme_commit:   logmap_need_commit() reports true once the map's
 *			   mtm_nmet exceeds this value.
 * logmap_maxnme_async:	   logmap_need_roll_async() reports true once the
 *			   map's mtm_nme exceeds this value.
 * logmap_maxnme_sync:	   logmap_need_roll_sync() reports true once the
 *			   map's mtm_nme exceeds this value.
 * logmap_maxcfrag_commit: copied into mtm_cfragmax by map_get();
 *			   logmap_need_commit() reports true once mtm_cfrags
 *			   reaches mtm_cfragmax.
 */
long	logmap_maxnme_commit	= 2048;
long	logmap_maxnme_async	= 4096;
long	logmap_maxnme_sync	= 6144;
long	logmap_maxcfrag_commit	= 4;	/* Max canceled fragments per moby */


/*
 * Cached roll buffer (crb) accounting and tunables.
 * ufs_crb_size is decremented atomically by CRB_FREE() whenever a crb's
 * data buffer is released.
 */
uint64_t ufs_crb_size = 0;		/* current size of all crb buffers */
uint64_t ufs_crb_max_size = 0;		/* highest crb buffer use so far */
size_t ufs_crb_limit;			/* max allowable size for crbs */
uint64_t ufs_crb_alloc_fails = 0;	/* crb allocation failures stat */
#define	UFS_MAX_CRB_DEFAULT_DIVISOR 10	/* max 1/10 kmem_maxavail() */
int ufs_max_crb_divisor = UFS_MAX_CRB_DEFAULT_DIVISOR; /* tunable */
/* forward declaration; invoked from the HANDLE_DQUOT() macro */
void handle_dquot(mapentry_t *);

/*
 * GENERIC MAP ROUTINES
 */

/*
 * Free a cached roll buffer: release its data buffer, subtract the buffer
 * size from the global ufs_crb_size accounting, free the crb_t itself and
 * clear the mapentry's reference to it.
 *
 * The macro arguments are parenthesized so that any pointer expression can
 * be passed safely.
 *
 * NOTE: this expands to several statements and supplies its own trailing
 * semicolon (CRB_RELE() invokes it without one), so it must only be used
 * inside a braced block, never as the sole body of an unbraced if/else.
 */
#define	CRB_FREE(crb, me) \
	kmem_free((crb)->c_buf, (crb)->c_nb); \
	atomic_add_64(&ufs_crb_size, -(uint64_t)(crb)->c_nb); \
	kmem_free((crb), sizeof (crb_t)); \
	(me)->me_crb = NULL;

/*
 * Drop one reference on the cached roll buffer attached to a mapentry
 * (if there is one); when the reference count reaches zero the buffer is
 * released with CRB_FREE(), which also clears me->me_crb.
 * The expansion declares a local named "crb" inside its own braces.
 */
#define	CRB_RELE(me) { \
	crb_t *crb = (me)->me_crb; \
	if (crb && (--crb->c_refcnt == 0)) { \
		CRB_FREE(crb, me) \
	} \
}

/*
 * Check that the old delta (me) has an argument and a push function of
 * ufs_trans_push_quota(), then check that the old and new (melist) deltas
 * differ in delta type, argument or push function.
 * If so we clean up with handle_dquot() before replacing the old delta.
 *
 * Both macro arguments are parenthesized so that arbitrary pointer
 * expressions can be passed; note that each argument is evaluated more
 * than once.
 */
#define	HANDLE_DQUOT(me, melist) { \
	if (((me)->me_arg) && \
	    ((me)->me_func == ufs_trans_push_quota)) { \
		if (!(((me)->me_dt == (melist)->me_dt) && \
		    ((me)->me_arg == (melist)->me_arg) && \
		    ((me)->me_func == (melist)->me_func))) { \
			handle_dquot(me); \
		} \
	} \
}

/*
 * Release every mapentry linked on the map's list, dropping each entry's
 * cached roll buffer reference, then empty all hash buckets and reset the
 * entry counters.
 */
void
map_free_entries(mt_map_t *mtm)
{
	mapentry_t	*me;
	int		bucket;

	for (;;) {
		me = mtm->mtm_next;
		if (me == (mapentry_t *)mtm)
			break;		/* list is empty again */

		/* unlink from the circular list before freeing */
		me->me_prev->me_next = me->me_next;
		me->me_next->me_prev = me->me_prev;

		CRB_RELE(me);
		kmem_cache_free(mapentry_cache, me);
	}

	bucket = 0;
	while (bucket < mtm->mtm_nhash) {
		mtm->mtm_hash[bucket] = NULL;
		bucket++;
	}

	mtm->mtm_nmet = 0;
	mtm->mtm_nme = 0;
}

/*
 * Tear down a map: free its entries and hash table, destroy its
 * synchronization objects and release the map structure itself.
 * Always returns NULL.
 */
mt_map_t *
map_put(mt_map_t *mtm)
{
	size_t	hashsz;

	/* release all mapentries first */
	map_free_entries(mtm);
	ASSERT(map_put_debug(mtm));

	/* hash bucket array */
	hashsz = sizeof (mapentry_t *) * mtm->mtm_nhash;
	kmem_free(mtm->mtm_hash, hashsz);

	/* locks */
	mutex_destroy(&mtm->mtm_mutex);
	mutex_destroy(&mtm->mtm_scan_mutex);
	mutex_destroy(&mtm->mtm_lock);
	rw_destroy(&mtm->mtm_rwlock);

	/* condition variables */
	cv_destroy(&mtm->mtm_to_roll_cv);
	cv_destroy(&mtm->mtm_from_roll_cv);
	cv_destroy(&mtm->mtm_cv_commit);
	cv_destroy(&mtm->mtm_cv_next);
	cv_destroy(&mtm->mtm_cv_eot);
	cv_destroy(&mtm->mtm_cv);

	kmem_free(mtm, sizeof (mt_map_t));
	return (NULL);
}
/*
 * Allocate and initialize a map of type maptype with nh hash buckets for
 * the log unit ul.  Both allocations use KM_SLEEP.
 */
mt_map_t *
map_get(ml_unit_t *ul, enum maptypes maptype, int nh)
{
	mt_map_t	*mtm;
	size_t		hashsz = sizeof (mapentry_t *) * nh;

	/*
	 * the map is assumed not to exist yet; build it from scratch
	 */
	mtm = kmem_zalloc(sizeof (mt_map_t), KM_SLEEP);
	mtm->mtm_hash = kmem_zalloc(hashsz, KM_SLEEP);
	mtm->mtm_nhash = nh;

	/* empty circular list: the map header points at itself */
	mtm->mtm_prev = (mapentry_t *)mtm;
	mtm->mtm_next = (mapentry_t *)mtm;

	mtm->mtm_type = maptype;
	mtm->mtm_debug = ul->un_debug;
	mtm->mtm_cfragmax = logmap_maxcfrag_commit;
	mtm->mtm_cfrags = 0;

	/* back pointer to the log unit (for scan test) */
	mtm->mtm_ul = ul;

	/*
	 * locks and condition variables
	 */
	mutex_init(&mtm->mtm_mutex, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&mtm->mtm_scan_mutex, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&mtm->mtm_lock, NULL, MUTEX_DEFAULT, NULL);
	rw_init(&mtm->mtm_rwlock, NULL, RW_DEFAULT, NULL);
	cv_init(&mtm->mtm_to_roll_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&mtm->mtm_from_roll_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&mtm->mtm_cv_commit, NULL, CV_DEFAULT, NULL);
	cv_init(&mtm->mtm_cv_next, NULL, CV_DEFAULT, NULL);
	cv_init(&mtm->mtm_cv_eot, NULL, CV_DEFAULT, NULL);
	cv_init(&mtm->mtm_cv, NULL, CV_DEFAULT, NULL);

	ASSERT(map_get_debug(ul, mtm));

	return (mtm);
}

/*
 * DELTAMAP ROUTINES
 */
/*
 * deltamap tuning constants
 */
long	deltamap_maxnme	= 1024;	/* global so it can be set */

/*
 * Report whether the deltamap holds more than deltamap_maxnme entries.
 * Returns 1 if so, 0 otherwise.
 */
int
deltamap_need_commit(mt_map_t *mtm)
{
	if (mtm->mtm_nme > deltamap_maxnme)
		return (1);
	return (0);
}

/*
 * put a delta into a deltamap; may sleep on memory
 *
 * The range (mof, nb) is split at MAPBLOCKSIZE boundaries and one mapentry
 * is created per piece unless an existing entry already covers that piece.
 *
 *	mtm	- map to add to
 *	mof	- starting master offset of the delta
 *	nb	- number of bytes in the delta
 *	dtyp	- delta type, stored in me_dt
 *	func	- push function, stored in me_func
 *	arg	- push function argument, stored in me_arg
 *	tp	- if non-NULL and mtm is the unit's deltamap, tp->deltas_size
 *		  is increased by an upper bound of the log space needed
 */
void
deltamap_add(
	mt_map_t *mtm,
	offset_t mof,
	off_t nb,
	delta_t dtyp,
	int (*func)(),
	ulong_t arg,
	threadtrans_t *tp)
{
	int32_t		hnb;
	mapentry_t	*me;
	mapentry_t	**mep;

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
		map_check_linkage(mtm));

	mutex_enter(&mtm->mtm_mutex);

	for (hnb = 0; nb; nb -= hnb, mof += hnb) {
		/* clip this piece so it does not cross a MAPBLOCKSIZE boundary */
		hnb = MAPBLOCKSIZE - (mof & MAPBLOCKOFF);
		if (hnb > nb)
			hnb = nb;
		/*
		 * Search for dup entry. We need to ensure that we don't
		 * replace a map entry which carries quota information
		 * with a map entry which doesn't. In that case we lose
		 * reference to the dquot structure which will not be
		 * cleaned up by the push function me->me_func as this will
		 * never be called.
		 * The stray dquot would be found later by invalidatedq()
		 * causing a panic when the filesystem is unmounted.
		 */
		mep = MAP_HASH(mof, mtm);
		for (me = *mep; me; me = me->me_hash) {
			if (DATAwithinME(mof, hnb, me)) {
			    if (me->me_func == ufs_trans_push_quota) {
				/*
				 * Don't remove quota entries which have
				 * incremented the ref count (those with a
				 * ufs_trans_push_quota push function).
				 * Let logmap_add[_buf] clean them up.
				 */
				continue;
			    }
			    break;
			}
			ASSERT((dtyp == DT_CANCEL) ||
				(!DATAoverlapME(mof, hnb, me)) ||
				MEwithinDATA(me, mof, hnb));
		}

		if (me) {
			/* already in map */
			continue;
		}

		/*
		 * Add up all the delta map deltas so we can compute
		 * an upper bound on the log size used.
		 * Note, some deltas get removed from the deltamap
		 * before the deltamap_push by lufs_write_strategy
		 * and so multiple deltas to the same mof offset
		 * don't get cancelled here but in the logmap.
		 * Thus we can't easily get an accurate count of
		 * the log space used - only an upper bound.
		 */
		if (tp && (mtm->mtm_ul->un_deltamap == mtm)) {
			ASSERT(dtyp != DT_CANCEL);
			if (dtyp == DT_ABZERO) {
				/* only the delta header is counted */
				tp->deltas_size += sizeof (struct delta);
			} else {
				/* delta header plus the data bytes */
				tp->deltas_size +=
				    (hnb + sizeof (struct delta));
			}
		}

		delta_stats[dtyp]++;

		/*
		 * get a mapentry
		 * Try a non-sleeping allocation first; if that fails,
		 * drop mtm_mutex, do a sleeping allocation and re-grab
		 * the mutex.
		 * NOTE(review): no recheck for a duplicate entry is done
		 * after the mutex has been re-acquired, so an entry added
		 * by another thread during that window is not detected
		 * here - confirm whether that is intended.
		 */
		me = kmem_cache_alloc(mapentry_cache, KM_NOSLEEP);
		if (me == NULL) {
			mutex_exit(&mtm->mtm_mutex);
			me = kmem_cache_alloc(mapentry_cache, KM_SLEEP);
			mutex_enter(&mtm->mtm_mutex);
		}
		bzero(me, sizeof (mapentry_t));

		/*
		 * initialize and put in deltamap
		 */
		me->me_mof = mof;
		me->me_nb = hnb;
		me->me_func = func;
		me->me_arg = arg;
		me->me_dt = dtyp;
		me->me_flags = ME_HASH;
		me->me_tid = mtm->mtm_tid;

		/* push on the front of the hash chain ... */
		me->me_hash = *mep;
		*mep = me;
		/* ... and append to the tail of the map's circular list */
		me->me_next = (mapentry_t *)mtm;
		me->me_prev = mtm->mtm_prev;
		mtm->mtm_prev->me_next = me;
		mtm->mtm_prev = me;
		mtm->mtm_nme++;
	}
	mutex_exit(&mtm->mtm_mutex);

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
		map_check_linkage(mtm));
}

/*
 * Unhook every delta lying entirely within (mof, nb) from the map and hand
 * the removed entries back as a list chained through me_hash (the most
 * recently removed entry first).  Returns NULL if mtm is NULL or nothing
 * matched.
 */
mapentry_t *
deltamap_remove(mt_map_t *mtm, offset_t mof, off_t nb)
{
	mapentry_t	*removed = NULL;
	mapentry_t	**linkp;
	mapentry_t	*me;
	off_t		chunk;

	if (mtm == NULL)
		return (NULL);

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
		map_check_linkage(mtm));

	mutex_enter(&mtm->mtm_mutex);
	while (nb != 0) {
		/* handle at most up to the next MAPBLOCKSIZE boundary */
		chunk = MAPBLOCKSIZE - (mof & MAPBLOCKOFF);
		if (chunk > nb)
			chunk = nb;

		/*
		 * walk the hash chain, moving matching entries onto
		 * the return list
		 */
		linkp = MAP_HASH(mof, mtm);
		while ((me = *linkp) != NULL) {
			if (!(MEwithinDATA(me, mof, chunk))) {
				linkp = &me->me_hash;
				continue;
			}

			/* off the hash chain and off the map list */
			*linkp = me->me_hash;
			me->me_next->me_prev = me->me_prev;
			me->me_prev->me_next = me->me_next;

			/* onto the head of the return list */
			me->me_hash = removed;
			removed = me;
			me->me_flags |= ME_LIST;
			me->me_flags &= ~ME_HASH;
			mtm->mtm_nme--;
		}

		nb -= chunk;
		mof += chunk;
	}
	mutex_exit(&mtm->mtm_mutex);

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
		map_check_linkage(mtm));

	return (removed);
}

/*
 * Remove the deltas within (mof, nb) from the map and free them.
 */
void
deltamap_del(mt_map_t *mtm, offset_t mof, off_t nb)
{
	mapentry_t	*list = deltamap_remove(mtm, mof, nb);

	/* the removed entries are chained through me_hash */
	while (list != NULL) {
		mapentry_t	*doomed = list;

		list = doomed->me_hash;
		kmem_cache_free(mapentry_cache, doomed);
	}
}

/*
 * Run each delta's push function so that the deltas migrate to the logmap.
 * top_end_sync() is the only caller and it has already waited for all
 * threads to complete, so nothing else is touching the deltamap and the
 * deltamap lock is not taken here.
 *
 * If the log unit is in error, or the push function returns non-zero, the
 * delta's range is deleted from the deltamap instead.
 */
void
deltamap_push(ml_unit_t *ul)
{
	mt_map_t	*mtm = ul->un_deltamap;
	mapentry_t	*me;

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
		map_check_linkage(mtm));

	/*
	 * keep taking the first entry until the list is empty
	 */
	for (me = mtm->mtm_next; me != (mapentry_t *)mtm;
	    me = mtm->mtm_next) {
		int		(*func)();
		delta_t		dtyp;
		ulong_t		arg;
		offset_t	mof;
		off_t		nb;
		int		discard;

		ASSERT(me->me_func);

		/* snapshot the entry before calling the push function */
		func = me->me_func;
		dtyp = me->me_dt;
		arg = me->me_arg;
		mof = me->me_mof;
		nb = me->me_nb;

		if (ul->un_flags & LDL_ERROR)
			discard = 1;
		else
			discard = ((*func)(ul->un_ufsvfs, dtyp, arg) != 0);

		if (discard)
			deltamap_del(mtm, mof, nb);
	}

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
		map_check_linkage(mtm));
}

/*
 * LOGMAP ROUTINES
 */

/*
 * Report whether the logmap wants a commit: either mtm_nmet has grown past
 * logmap_maxnme_commit or the number of cancelled fragments has reached
 * the map's limit.  Returns 1 if so, 0 otherwise.
 */
int
logmap_need_commit(mt_map_t *mtm)
{
	if (mtm->mtm_nmet > logmap_maxnme_commit)
		return (1);
	if (mtm->mtm_cfrags >= mtm->mtm_cfragmax)
		return (1);
	return (0);
}

/*
 * Report whether the logmap entry count exceeds the asynchronous roll
 * threshold (logmap_maxnme_async).  Returns 1 if so, 0 otherwise.
 */
int
logmap_need_roll_async(mt_map_t *mtm)
{
	if (mtm->mtm_nme > logmap_maxnme_async)
		return (1);
	return (0);
}

/*
 * Report whether the logmap entry count exceeds the synchronous roll
 * threshold (logmap_maxnme_sync).  Returns 1 if so, 0 otherwise.
 */
int
logmap_need_roll_sync(mt_map_t *mtm)
{
	if (mtm->mtm_nme > logmap_maxnme_sync)
		return (1);
	return (0);
}

/*
 * Set the log tail and start the roll thread for this log unit unless one
 * is already marked as running.
 */
void
logmap_start_roll(ml_unit_t *ul)
{
	mt_map_t	*logmap = ul->un_logmap;

	logmap_settail(logmap, ul);
	ASSERT((ul->un_flags & LDL_NOROLL) == 0);

	mutex_enter(&logmap->mtm_mutex);
	if (logmap->mtm_flags & MTM_ROLL_RUNNING) {
		/* a roll thread already exists */
		mutex_exit(&logmap->mtm_mutex);
		return;
	}

	/* mark as running and clear any stale force/exit requests */
	logmap->mtm_flags |= MTM_ROLL_RUNNING;
	logmap->mtm_flags &= ~(MTM_FORCE_ROLL | MTM_ROLL_EXIT);
	(void) thread_create(NULL, 0, trans_roll, ul, 0, &p0,
	    TS_RUN, minclsyspri);

	mutex_exit(&logmap->mtm_mutex);
}

/*
 * Ask the roll thread to exit and wait until it is no longer marked as
 * running.  Does nothing if the unit has no logmap.
 */
void
logmap_kill_roll(ml_unit_t *ul)
{
	mt_map_t	*mtm = ul->un_logmap;

	if (mtm != NULL) {
		mutex_enter(&mtm->mtm_mutex);
		for (;;) {
			if ((mtm->mtm_flags & MTM_ROLL_RUNNING) == 0)
				break;
			/* request exit, wake the roller, wait for its reply */
			mtm->mtm_flags |= MTM_ROLL_EXIT;
			cv_signal(&mtm->mtm_to_roll_cv);
			cv_wait(&mtm->mtm_from_roll_cv, &mtm->mtm_mutex);
		}
		mutex_exit(&mtm->mtm_mutex);
	}
}

/*
 * Nudge the roll thread if it is not currently rolling; do not wait.
 */
void
logmap_forceroll_nowait(mt_map_t *logmap)
{
	ASSERT((logmap->mtm_ul->un_flags & LDL_NOROLL) == 0);

	/*
	 * mtm_flags is deliberately read without holding mtm_mutex: a
	 * transitional value is harmless in the rare case we see one.
	 * Signalling is enough - the roll thread will wake up and notice
	 * that there are too many logmap entries.
	 */
	if (logmap->mtm_flags & MTM_ROLLING)
		return;

	cv_signal(&logmap->mtm_to_roll_cv);
}

/*
 * Kick the roll thread and block until it has completed a cycle, i.e.
 * until MTM_FORCE_ROLL has been cleared, or until the roll thread is found
 * not to be running (in which case the flag is cleared here).
 */
void
logmap_forceroll(mt_map_t *mtm)
{
	mutex_enter(&mtm->mtm_mutex);

	/* post the request unless one is already pending */
	if (!(mtm->mtm_flags & MTM_FORCE_ROLL)) {
		mtm->mtm_flags |= MTM_FORCE_ROLL;
		cv_signal(&mtm->mtm_to_roll_cv);
	}

	for (;;) {
		if (!(mtm->mtm_flags & MTM_ROLL_RUNNING)) {
			/* nobody will service the request; withdraw it */
			mtm->mtm_flags &= ~MTM_FORCE_ROLL;
			break;
		}
		cv_wait(&mtm->mtm_from_roll_cv, &mtm->mtm_mutex);
		if (!(mtm->mtm_flags & MTM_FORCE_ROLL))
			break;
	}

	mutex_exit(&mtm->mtm_mutex);
}

/*
 * remove rolled deltas within (mof, nb) and free them
 *
 * Only entries flagged ME_ROLL that lie entirely within the range are
 * removed.  The first pass runs with just mtm_mutex held; if a candidate
 * entry is found with ME_AGE set, the scan is restarted from the
 * beginning with mtm_rwlock additionally held as writer.
 */
void
logmap_remove_roll(mt_map_t *mtm, offset_t mof, off_t nb)
{
	int		dolock = 0;	/* set once mtm_rwlock is needed */
	off_t		hnb;
	mapentry_t	*me;
	mapentry_t	**mep;
	offset_t	savmof	= mof;	/* original range, for the restart */
	off_t		savnb	= nb;

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
		map_check_linkage(mtm));

again:
	if (dolock)
		rw_enter(&mtm->mtm_rwlock, RW_WRITER);
	mutex_enter(&mtm->mtm_mutex);
	for (hnb = 0; nb; nb -= hnb, mof += hnb) {
		/* clip this piece so it does not cross a MAPBLOCKSIZE boundary */
		hnb = MAPBLOCKSIZE - (mof & MAPBLOCKOFF);
		if (hnb > nb)
			hnb = nb;
		/*
		 * remove and free the rolled entries
		 */
		mep = MAP_HASH(mof, mtm);
		while ((me = *mep) != 0) {
			if ((me->me_flags & ME_ROLL) &&
			    (MEwithinDATA(me, mof, hnb))) {
				if (me->me_flags & ME_AGE) {
					/*
					 * Entry is on an age list; drop the
					 * mutex and start over holding
					 * mtm_rwlock as writer.  This can
					 * happen at most once (see ASSERT).
					 */
					ASSERT(dolock == 0);
					dolock = 1;
					mutex_exit(&mtm->mtm_mutex);
					mof = savmof;
					nb = savnb;
					goto again;
				}
				/* unlink from hash chain and map list */
				*mep = me->me_hash;
				me->me_next->me_prev = me->me_prev;
				me->me_prev->me_next = me->me_next;
				me->me_flags &= ~(ME_HASH|ME_ROLL);
				ASSERT(!(me->me_flags & ME_USER));
				mtm->mtm_nme--;
				/*
				 * cancelled entries are handled by someone else
				 */
				if ((me->me_flags & ME_CANCEL) == 0) {
					roll_stats[me->me_dt]++;
					CRB_RELE(me);
					kmem_cache_free(mapentry_cache, me);
				}
			} else
				mep = &me->me_hash;
		}
	}
	mutex_exit(&mtm->mtm_mutex);

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
		map_check_linkage(mtm));

	if (dolock)
		rw_exit(&mtm->mtm_rwlock);
}

/*
 * Locate the next delta that can be rolled.
 * Returns 0: nothing left to roll, or the next candidate belongs to the
 *	      current or the committing transaction
 *	   1: a delta was found; *mofp has been set to its master file
 *	      disk offset
 */
int
logmap_next_roll(mt_map_t *logmap, offset_t *mofp)
{
	mapentry_t	*me;
	int		found = 0;

	ASSERT(((logmap->mtm_debug & MT_CHECK_MAP) == 0) ||
		map_check_linkage(logmap));

	mutex_enter(&logmap->mtm_mutex);
	for (me = logmap->mtm_next; me != (mapentry_t *)logmap;
	    me = me->me_next) {
		/* skip entries that are already being rolled */
		if (me->me_flags & ME_ROLL)
			continue;

		/*
		 * stop at the first entry belonging to the currently
		 * busy transaction or to the one whose commit is in
		 * progress
		 */
		if (me->me_tid == logmap->mtm_tid ||
		    me->me_tid == logmap->mtm_committid)
			break;

		/*
		 * DT_CANCEL mapentries are removed at the end of each
		 * committed transaction, so none should be seen here
		 * with a tid other than mtm_tid or mtm_committid.
		 */
		ASSERT(me->me_dt != DT_CANCEL);

		*mofp = me->me_mof;
		found = 1;
		break;
	}
	mutex_exit(&logmap->mtm_mutex);

	return (found);
}

/*
 * Insert meadd into the age list, which is kept in ascending me_age
 * order; the new entry goes in front of the first entry with a larger
 * age.  The entry is flagged ME_AGE.
 */
static void
logmap_list_age(mapentry_t **age, mapentry_t *meadd)
{
	mapentry_t	**linkp = age;
	mapentry_t	*cur;

	ASSERT((meadd->me_flags & (ME_AGE|ME_LIST)) == 0);

	/* find the link that should point at the new entry */
	while ((cur = *linkp) != NULL) {
		if (cur->me_age > meadd->me_age)
			break;
		linkp = &cur->me_agenext;
	}

	meadd->me_flags |= ME_AGE;
	meadd->me_agenext = cur;
	*linkp = meadd;
}

/*
 * get a list of deltas within <mof, mof+nb>
 *	returns with mtm_rwlock held
 *	return value says whether the entire mof range is covered by deltas
 *
 * The matching, non-cancelled entries are returned through *age as an
 * age ordered list and are flagged ME_AGE.  mtm_rwlock is first taken as
 * reader; if an overlapping entry already has ME_AGE set, everything is
 * undone and the scan is repeated with the lock taken as writer.
 * The return value is 1 when at least one entry (or its cached roll
 * buffer, if it has one) covers the whole requested range by itself.
 */
int
logmap_list_get(
	mt_map_t *mtm,
	offset_t mof,
	off_t nb,
	mapentry_t **age)
{
	off_t		hnb;
	mapentry_t	*me;
	mapentry_t	**mep;
	int		rwtype	= RW_READER;
	offset_t	savmof	= mof;	/* original range, for the restart */
	off_t		savnb	= nb;
	int		entire	= 0;
	crb_t		*crb;

	mtm->mtm_ref = 1;
again:

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
		map_check_linkage(mtm));

	rw_enter(&mtm->mtm_rwlock, rwtype);
	*age = NULL;
	mutex_enter(&mtm->mtm_mutex);
	for (hnb = 0; nb; nb -= hnb, mof += hnb) {
		/* clip this piece so it does not cross a MAPBLOCKSIZE boundary */
		hnb = MAPBLOCKSIZE - (mof & MAPBLOCKOFF);
		if (hnb > nb)
			hnb = nb;
		/*
		 * find overlapping entries
		 */
		mep = MAP_HASH(mof, mtm);
		for (me = *mep; me; me = me->me_hash) {
			if (me->me_dt == DT_CANCEL)
				continue;
			if (!DATAoverlapME(mof, hnb, me))
				continue;
			/*
			 * check if map entry is in use
			 * (about to be rolled).
			 */
			if (me->me_flags & ME_AGE) {
				/*
				 * reset the age bit in the list,
				 * upgrade the lock, and try again
				 */
				for (me = *age; me; me = *age) {
					*age = me->me_agenext;
					me->me_flags &= ~ME_AGE;
				}
				mutex_exit(&mtm->mtm_mutex);
				rw_exit(&mtm->mtm_rwlock);
				rwtype = RW_WRITER;
				mof = savmof;
				nb = savnb;
				entire = 0;
				goto again;
			} else {
				/* add mapentry to age ordered list */
				logmap_list_age(age, me);
				/*
				 * Coverage is checked against the whole
				 * original range, using the cached roll
				 * buffer's extent when the entry has one.
				 */
				crb = me->me_crb;
				if (crb) {
					if (DATAwithinCRB(savmof, savnb, crb)) {
						entire = 1;
					}
				} else {
					if (DATAwithinME(savmof, savnb, me)) {
						entire = 1;
					}
				}
			}
		}
	}
	mutex_exit(&mtm->mtm_mutex);

	/* mtm_rwlock intentionally stays held; logmap_list_put() drops it */
	ASSERT(RW_LOCK_HELD(&mtm->mtm_rwlock));
	return (entire);
}

/*
 * Get a list of deltas for rolling - returns success or failure.
 * Also return the cached roll buffer if all deltas point to it.
 *
 * The caller must hold mtm_rwlock and mof must be MAPBLOCKSIZE aligned.
 * On success (0) rbp->rb_age holds the age ordered list (possibly NULL if
 * nothing qualified), every listed entry is flagged ME_AGE and ME_ROLL,
 * and rbp->rb_crb is set to the shared valid roll buffer or NULL.
 * On failure (1) an overlapping entry was already flagged ME_AGE; the
 * entries collected so far have had ME_AGE cleared again.
 */
int
logmap_list_get_roll(mt_map_t *logmap, offset_t mof, rollbuf_t *rbp)
{
	mapentry_t	*me, **mep, *age = NULL;
	crb_t		*crb = NULL;

	ASSERT(RW_LOCK_HELD(&logmap->mtm_rwlock));
	ASSERT(((logmap->mtm_debug & MT_CHECK_MAP) == 0) ||
		map_check_linkage(logmap));
	ASSERT((mof & MAPBLOCKOFF) == 0);

	rbp->rb_crb = NULL;

	/*
	 * find overlapping entries
	 */
	mutex_enter(&logmap->mtm_mutex);
	mep = MAP_HASH(mof, logmap);
	for (me = *mep; me; me = me->me_hash) {
		if (!DATAoverlapME(mof, MAPBLOCKSIZE, me))
			continue;
		/* skip the current and the committing transactions */
		if (me->me_tid == logmap->mtm_tid)
			continue;
		if (me->me_tid == logmap->mtm_committid)
			continue;
		if (me->me_dt == DT_CANCEL)
			continue;

		/*
		 * Check if map entry is in use (by lufs_read_strategy())
		 * and if so reset the age bit on the entries collected so
		 * far and return failure; no retry is attempted here.
		 */
		if (me->me_flags & ME_AGE) {
			for (me = age; me; me = age) {
				age = me->me_agenext;
				me->me_flags &= ~ME_AGE;
			}
			mutex_exit(&logmap->mtm_mutex);
			return (1); /* failure */
		} else {
			/* add mapentry to age ordered list */
			logmap_list_age(&age, me);
		}
	}
	if (!age) {
		/* nothing to roll for this block; still a success */
		goto out;
	}

	/*
	 * Mark the deltas as being rolled.
	 */
	for (me = age; me; me = me->me_agenext) {
		me->me_flags |= ME_ROLL;
	}

	/*
	 * Test if all deltas are covered by one valid roll buffer
	 */
	crb = age->me_crb;
	if (crb && !(crb->c_invalid)) {
		for (me = age; me; me = me->me_agenext) {
			if (me->me_crb != crb) {
				/* at least one delta uses a different crb */
				crb = NULL;
				break;
			}
		}
		rbp->rb_crb = crb;
	}
out:
	rbp->rb_age = age;

	mutex_exit(&logmap->mtm_mutex);

	ASSERT(((logmap->mtm_debug & MT_SCAN) == 0) ||
		logmap_logscan_debug(logmap, age));
	ASSERT(RW_LOCK_HELD(&logmap->mtm_rwlock));
	return (0); /* success */
}

/*
 * Release an age list obtained for rolling: clear ME_AGE on every entry.
 * mtm_rwlock must be held by the caller and stays held.
 */
void
logmap_list_put_roll(mt_map_t *mtm, mapentry_t *age)
{
	ASSERT(RW_LOCK_HELD(&mtm->mtm_rwlock));

	mutex_enter(&mtm->mtm_mutex);
	while (age != NULL) {
		mapentry_t	*me = age;

		age = me->me_agenext;
		me->me_flags &= ~ME_AGE;
	}
	mutex_exit(&mtm->mtm_mutex);
}

/*
 * Release an age list: clear ME_AGE on every entry and then drop
 * mtm_rwlock, which the caller must be holding on entry.
 */
void
logmap_list_put(mt_map_t *mtm, mapentry_t *age)
{
	ASSERT(RW_LOCK_HELD(&mtm->mtm_rwlock));

	mutex_enter(&mtm->mtm_mutex);
	while (age != NULL) {
		mapentry_t	*me = age;

		age = me->me_agenext;
		me->me_flags &= ~ME_AGE;
	}
	mutex_exit(&mtm->mtm_mutex);

	rw_exit(&mtm->mtm_rwlock);
}

#define	UFS_RW_BALANCE 2
int ufs_rw_balance = UFS_RW_BALANCE;

/*
 * Decide whether the master must be read before this block is rolled.
 * No read is needed if the log deltas for the block consist only of
 * full, sector aligned disk sectors.
 * Cylinder group bit maps DT_CG (8K), directory entries (512B) and
 * possibly others should not require master disk reads.
 * The sector map used for the later write is computed on the way and
 * stored in rbp->rb_secmap.
 * Returns non-zero if the master must be read.
 */
int
logmap_setup_read(mapentry_t *age, rollbuf_t *rbp)
{
	mapentry_t	*me;
	rbsecmap_t	secmap = 0;
	int		read_needed = 0;
	int		all_inodes = 1;
	int		first_sec = INT_MAX;
	int		last_sec = -1;
	int		sec;

	/* LINTED: warning: logical expression always true: op "||" */
	ASSERT((MAPBLOCKSIZE / DEV_BSIZE) == (sizeof (secmap) * NBBY));

	for (me = age; me != NULL; me = me->me_agenext) {
		crb_t		*crb = me->me_crb;
		offset_t	mof;
		int32_t		nb;
		int		start_sec, end_sec;

		/* a cached roll buffer, if present, defines the extent */
		if (crb != NULL) {
			mof = crb->c_mof;
			nb = crb->c_nb;
		} else {
			mof = me->me_mof;
			nb = me->me_nb;
		}

		/*
		 * A delta that is not sector aligned forces a read of
		 * the whole block.
		 */
		if ((nb & DEV_BMASK) || (mof & DEV_BMASK))
			read_needed = 1;

		/* Record the sectors touched within the MAPBLOCKSIZE block */
		start_sec = (mof & MAPBLOCKOFF) >> DEV_BSHIFT;
		end_sec = start_sec + ((nb - 1) >> DEV_BSHIFT);
		for (sec = start_sec; sec <= end_sec; sec++)
			secmap |= UINT16_C(1) << sec;

		if (me->me_dt != DT_INODE)
			all_inodes = 0;
		if (start_sec < first_sec)
			first_sec = start_sec;
		if (end_sec > last_sec)
			last_sec = end_sec;
	}

	ASSERT(secmap);
	ASSERT(first_sec != INT_MAX);
	ASSERT(last_sec != -1);

	if (all_inodes) {
		/*
		 * This is a tradeoff.  Two writes within the same
		 * MAPBLOCKSIZE chunk must be better than a read plus a
		 * write, but what about three or more writes versus a
		 * read+write?  The cut over depends on track caching, the
		 * scsi driver and other activity.  An unpublished tunable
		 * (ufs_rw_balance), currently defaulting to 2, sets the
		 * number of gaps tolerated.
		 */
		if (!read_needed) {
			int	gaps = 0;
			int	in_gap = 0;

			/* Count the gaps (every 1 to 0 transition) */
			for (sec = first_sec + 1; sec < last_sec; sec++) {
				if (secmap & (UINT16_C(1) << sec)) {
					/* sector will be written */
					in_gap = 0;
					continue;
				}
				if (in_gap)
					continue;
				in_gap = 1;
				if (++gaps > ufs_rw_balance) {
					read_needed = 1;
					break;
				}
			}
		}

		/*
		 * Inodes commonly make up the majority (~85%) of deltas.
		 * They cannot contain embedded user data, so it is safe
		 * to read and write them all in one IO.
		 * For directory entries, shadow inode data and quota
		 * record data, user data fragments can be embedded
		 * between the metadata, so it is not safe to read, modify
		 * and then write the entire range: asynchronous user data
		 * writes could get overwritten with old data.  Those cases
		 * therefore keep a sector map covering only the metadata
		 * that needs to be written.
		 *
		 * If user data was logged then this issue would go away.
		 */
		if (read_needed) {
			for (sec = first_sec + 1; sec < last_sec; sec++)
				secmap |= (UINT16_C(1) << sec);
		}
	}

	rbp->rb_secmap = secmap;
	return (read_needed);
}

/*
 * Abort the load of a set of log map deltas, i.e. discard every mapentry
 * on this unit's log map whose tid (transaction id) equals the tid
 * parameter or the map's mtm_committid.  The cancel list is emptied
 * as well.
 */
static void
logmap_abort(ml_unit_t *ul, uint32_t tid)
{
	struct mt_map	*mtm = ul->un_logmap;	/* Log map */
	mapentry_t	**linkp;
	mapentry_t	*me;
	int		bucket;

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
		map_check_linkage(mtm));

	/*
	 * taking the lock as writer waits for outstanding reads to
	 * finish and locks out future reads
	 */
	rw_enter(&mtm->mtm_rwlock, RW_WRITER);

	mutex_enter(&mtm->mtm_mutex);

	/* Empty the cancel list */
	for (me = mtm->mtm_cancel; me != NULL; me = mtm->mtm_cancel) {
		mtm->mtm_cancel = me->me_cancel;
		me->me_cancel = NULL;
		me->me_flags &= ~ME_CANCEL;
	}

	/*
	 * Remove all mapentries carrying either the given tid or the
	 * committid.  Callers are logmap_logscan and logmap_commit.
	 * From logmap_logscan, mtm_tid == mtm_committid.  From
	 * logmap_commit we are here because the log errored while writing
	 * the commit record, after the async ops were allowed to start in
	 * top_end_sync, so the mapentries of the transaction whose commit
	 * failed have to go too.
	 */
	for (bucket = 0; bucket < mtm->mtm_nhash; bucket++) {
		linkp = &mtm->mtm_hash[bucket];
		while ((me = *linkp) != NULL) {
			if (me->me_tid != tid &&
			    me->me_tid != mtm->mtm_committid) {
				/* not ours; step to the next entry */
				linkp = &me->me_hash;
				continue;
			}

			/* unlink from hash chain and map list, then free */
			*linkp = me->me_hash;
			me->me_next->me_prev = me->me_prev;
			me->me_prev->me_next = me->me_next;
			if ((me->me_flags & ME_USER) == 0)
				mtm->mtm_nme--;
			CRB_RELE(me);
			kmem_cache_free(mapentry_cache, me);
		}
	}

	if ((ul->un_flags & LDL_SCAN) == 0)
		mtm->mtm_flags |= MTM_CANCELED;
	mutex_exit(&mtm->mtm_mutex);

	/* reset under the rwlock only; mtm_mutex has been dropped */
	mtm->mtm_dirty = 0;
	mtm->mtm_nmet = 0;
	rw_exit(&mtm->mtm_rwlock);

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
		map_check_linkage(mtm));
}

/*
 * Block until the log has room for the mapentry list `me', forcing a
 * roll of the log map each time there is not enough space.  Gives up
 * as soon as the log unit is flagged LDL_ERROR.
 *
 * Called with ul->un_log_mutex held; the mutex is dropped around each
 * forced roll and is held again on return.
 */
static void
logmap_wait_space(mt_map_t *mtm, ml_unit_t *ul, mapentry_t *me)
{
	int	log_failed = 0;

	ASSERT(MUTEX_HELD(&ul->un_log_mutex));

	while (!log_failed && !ldl_has_space(ul, me)) {
		ASSERT(!(ul->un_flags & LDL_NOROLL));

		/* the roll must run without the log mutex held */
		mutex_exit(&ul->un_log_mutex);
		logmap_forceroll(mtm);
		mutex_enter(&ul->un_log_mutex);

		log_failed = ((ul->un_flags & LDL_ERROR) != 0);
	}

	ASSERT(MUTEX_HELD(&ul->un_log_mutex));
}

/*
 * Put a list of deltas into a logmap.
 * If va == NULL, don't write to the log.
 *
 * ul     - log unit whose logmap (ul->un_logmap) receives the entries
 * va     - buffer holding the deltas and data, or NULL
 * vamof  - offset on the master device of the start of va
 * melist - list of mapentries to add, linked through me_hash
 *
 * The whole operation runs under ul->un_log_mutex; mtm_mutex is taken
 * separately around each hash chain search and each insertion.
 *
 * For every incoming entry, existing entries on the same hash chain that
 * lie within the new data (and are neither ME_ROLL nor ME_CANCEL) are
 * either freed on the spot (same tid and not ME_AGE) or placed on the
 * map's cancel list.
 *
 * If the log unit is flagged LDL_ERROR, the incoming entry is freed here
 * rather than hashed, so callers must not touch entries they passed in
 * once this routine has returned.
 */
void
logmap_add(
	ml_unit_t *ul,
	char *va,			/* Ptr to buf w/deltas & data */
	offset_t vamof,			/* Offset on master of buf start */
	mapentry_t *melist)		/* Entries to add */
{
	offset_t	mof;
	off_t		nb;
	mapentry_t	*me;
	mapentry_t	**mep;
	mapentry_t	**savmep;
	uint32_t	tid;
	mt_map_t	*mtm	= ul->un_logmap;

	mutex_enter(&ul->un_log_mutex);
	/* log space is only needed when the deltas are actually written */
	if (va)
		logmap_wait_space(mtm, ul, melist);

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
		map_check_linkage(mtm));

	/* map has been referenced and is dirty */
	mtm->mtm_ref = 1;
	mtm->mtm_dirty++;
	/* every entry added by this call is stamped with the same tid */
	tid = mtm->mtm_tid;
	while (melist) {
		mof = melist->me_mof;
		nb  = melist->me_nb;

		/*
		 * search for overlapping entries
		 * (savmep remembers the chain head for the insertion below)
		 */
		savmep = mep = MAP_HASH(mof, mtm);
		mutex_enter(&mtm->mtm_mutex);
		while ((me = *mep) != 0) {
			/*
			 * Data consumes old map entry; cancel map entry.
			 * Take care when we replace an old map entry
			 * which carries quota information with a newer entry
			 * which does not. In that case the push function
			 * would not be called to clean up the dquot structure.
			 * This would be found later by invalidatedq() causing
			 * a panic when the filesystem is unmounted.
			 * We clean up the dquot manually and then replace
			 * the map entry.
			 */
			if (MEwithinDATA(me, mof, nb) &&
			    ((me->me_flags & (ME_ROLL|ME_CANCEL)) == 0)) {
				if (tid == me->me_tid &&
				    ((me->me_flags & ME_AGE) == 0)) {
					/*
					 * Same transaction and not ME_AGE:
					 * unlink from the hash chain and the
					 * ordered list and free it right away.
					 */
					*mep = me->me_hash;
					me->me_next->me_prev = me->me_prev;
					me->me_prev->me_next = me->me_next;
					ASSERT(!(me->me_flags & ME_USER));
					mtm->mtm_nme--;
					/*
					 * Special case if the mapentry
					 * carries a dquot and a push function.
					 * We have to clean up the quota info
					 * before replacing the mapentry.
					 */
					if (me->me_dt == DT_QR)
						HANDLE_DQUOT(me, melist);

					kmem_cache_free(mapentry_cache, me);
					/* *mep already names the successor */
					continue;
				}
				/* otherwise defer: push onto the cancel list */
				me->me_cancel = mtm->mtm_cancel;
				mtm->mtm_cancel = me;
				me->me_flags |= ME_CANCEL;
			}
			mep = &(*mep)->me_hash;
		}
		mutex_exit(&mtm->mtm_mutex);

		/*
		 * remove from list
		 */
		me = melist;
		melist = melist->me_hash;
		me->me_flags &= ~ME_LIST;
		/*
		 * If va != NULL, put in the log.
		 */
		if (va)
			ldl_write(ul, va, vamof, me);
		/*
		 * Log unit is in error: drop this entry and move on to the
		 * next one (each remaining entry ends up freed the same way
		 * while the flag stays set).
		 */
		if (ul->un_flags & LDL_ERROR) {
			kmem_cache_free(mapentry_cache, me);
			continue;
		}
		ASSERT((va == NULL) ||
			((mtm->mtm_debug & MT_LOG_WRITE_CHECK) == 0) ||
			map_check_ldl_write(ul, va, vamof, me));

		/*
		 * put on hash
		 * (head of the hash chain, tail of the ordered list)
		 */
		mutex_enter(&mtm->mtm_mutex);
		me->me_hash = *savmep;
		*savmep = me;
		me->me_next = (mapentry_t *)mtm;
		me->me_prev = mtm->mtm_prev;
		mtm->mtm_prev->me_next = me;
		mtm->mtm_prev = me;
		me->me_flags |= ME_HASH;
		me->me_tid = tid;
		me->me_age = mtm->mtm_age++;
		mtm->mtm_nme++;
		mtm->mtm_nmet++;
		mutex_exit(&mtm->mtm_mutex);
	}

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
		map_check_linkage(mtm));
	mutex_exit(&ul->un_log_mutex);
}

/*
 * Add the delta(s) into the log.
 * Create one cached roll buffer logmap entry, and reference count the
 * number of mapentries referring to it.
 * Cancel previous logmap entries.
 * logmap_add_buf is tolerant of failure to allocate a cached roll buffer.
 *
 * ul     - log unit whose logmap (ul->un_logmap) receives the entries
 * va     - pointer, inside buf, to the deltas and data (must be non-NULL)
 * bufmof - offset on the master device of the start of buf
 * melist - list of mapentries to add, linked through me_hash
 * buf    - buffer containing the delta(s)
 * bufsz  - size of buf; asserted to be a multiple of the device block size
 *
 * The whole operation runs under ul->un_log_mutex; mtm_mutex is taken
 * separately around each hash chain search and each insertion.
 *
 * If the log unit is flagged LDL_ERROR, the incoming entry (and its crb
 * reference) is released here rather than hashed.
 */
void
logmap_add_buf(
	ml_unit_t *ul,
	char *va,			/* Ptr to buf w/deltas & data */
	offset_t bufmof,		/* Offset on master of buf start */
	mapentry_t *melist,		/* Entries to add */
	caddr_t	buf,			/* Buffer containing delta(s) */
	uint32_t bufsz)			/* Size of buf */
{
	offset_t	mof;
	/* master offset of va, derived from va's position within buf */
	offset_t	vamof = bufmof + (va - buf);
	off_t		nb;
	mapentry_t	*me;
	mapentry_t	**mep;
	mapentry_t	**savmep;
	uint32_t	tid;
	mt_map_t	*mtm	= ul->un_logmap;
	crb_t		*crb;
	crb_t		*crbsav = NULL;	/* crb chosen for the incoming entry */

	ASSERT((bufsz & DEV_BMASK) == 0);
	mutex_enter(&ul->un_log_mutex);
	logmap_wait_space(mtm, ul, melist);

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
		map_check_linkage(mtm));

	/* map has been referenced and is dirty */
	mtm->mtm_ref = 1;
	mtm->mtm_dirty++;
	/* every entry added by this call is stamped with the same tid */
	tid = mtm->mtm_tid;
	while (melist) {
		mof = melist->me_mof;
		nb  = melist->me_nb;

		/*
		 * search for overlapping entries
		 * (savmep remembers the chain head for the insertion below)
		 */
		savmep = mep = MAP_HASH(mof, mtm);
		mutex_enter(&mtm->mtm_mutex);
		while ((me = *mep) != 0) {
			/*
			 * Data consumes old map entry; cancel map entry.
			 * Take care when we replace an old map entry
			 * which carries quota information with a newer entry
			 * which does not. In that case the push function
			 * would not be called to clean up the dquot structure.
			 * This would be found later by invalidatedq() causing
			 * a panic when the filesystem is unmounted.
			 * We clean up the dquot manually and then replace
			 * the map entry.
			 */
			crb = me->me_crb;
			if (MEwithinDATA(me, mof, nb) &&
			    ((me->me_flags & (ME_ROLL|ME_CANCEL)) == 0)) {
				if (tid == me->me_tid &&
				    ((me->me_flags & ME_AGE) == 0)) {
					/*
					 * Same transaction and not ME_AGE:
					 * unlink from the hash chain and the
					 * ordered list and free it right away.
					 */
					*mep = me->me_hash;
					me->me_next->me_prev = me->me_prev;
					me->me_prev->me_next = me->me_next;
					ASSERT(!(me->me_flags & ME_USER));
					mtm->mtm_nme--;
					/*
					 * Special case if the mapentry
					 * carries a dquot and a push function.
					 * We have to clean up the quota info
					 * before replacing the mapentry.
					 */
					if (me->me_dt == DT_QR)
						HANDLE_DQUOT(me, melist);

					/*
					 * If this soon to be deleted mapentry
					 * has a suitable roll buffer then
					 * re-use it.
					 * "Suitable" here means: this was the
					 * last reference, no crb has been
					 * picked yet, and the sizes match.
					 * Otherwise a crb whose last reference
					 * was just dropped is freed.
					 */
					if (crb && (--crb->c_refcnt == 0)) {
						if (crbsav ||
						    (crb->c_nb != bufsz)) {
							CRB_FREE(crb, me);
						} else {
							bcopy(buf, crb->c_buf,
							    bufsz);
							crb->c_invalid = 0;
							crb->c_mof = bufmof;
							crbsav = crb;
							me->me_crb = NULL;
						}
					}
					kmem_cache_free(mapentry_cache, me);
					/* *mep already names the successor */
					continue;
				}
				/* otherwise defer: push onto the cancel list */
				me->me_cancel = mtm->mtm_cancel;
				mtm->mtm_cancel = me;
				me->me_flags |= ME_CANCEL;
			}

			/*
			 * Inode deltas within the same fs block come
			 * in individually as separate calls to logmap_add().
			 * All others come in as one call. So check for an
			 * existing entry where we can re-use the crb.
			 * The existing entry keeps its reference; the crb
			 * contents are refreshed from the incoming buffer.
			 */
			if ((me->me_dt == DT_INODE) && (tid == me->me_tid) &&
			    !crbsav && crb &&
			    WITHIN(mof, nb, crb->c_mof, crb->c_nb)) {
				ASSERT(crb->c_mof == bufmof);
				ASSERT(crb->c_nb == bufsz);
				bcopy(buf, crb->c_buf, bufsz);
				crbsav = crb;
			}
			mep = &(*mep)->me_hash;
		}
		mutex_exit(&mtm->mtm_mutex);

		/*
		 * If we don't already have a crb then allocate one
		 * and copy the incoming buffer. Only do this once
		 * for all the incoming deltas.
		 *
		 * NOTE(review): crbsav is reset to NULL at the bottom of
		 * every iteration, so this test is re-evaluated for each
		 * entry on melist; later entries only share a crb if the
		 * hash search above finds one.  Confirm this matches the
		 * "only once" intent.
		 */
		if ((crbsav == NULL) && (melist->me_dt != DT_ABZERO)) {
			/*
			 * Only use a cached roll buffer if we
			 * have enough memory, and check for failures.
			 */
			if (((ufs_crb_size + bufsz) < ufs_crb_limit) &&
			    (kmem_avail() > bufsz)) {
				crbsav = kmem_alloc(sizeof (crb_t), KM_NOSLEEP);
			} else {
				ufs_crb_alloc_fails++;
			}
			if (crbsav) {
				crbsav->c_buf = kmem_alloc(bufsz, KM_NOSLEEP);
				if (crbsav->c_buf) {
					atomic_add_64(&ufs_crb_size,
					    (uint64_t)bufsz);
					/*
					 * high-water mark; plain compare and
					 * store, unlike the atomic add above
					 */
					if (ufs_crb_size > ufs_crb_max_size) {
						ufs_crb_max_size = ufs_crb_size;
					}
					bcopy(buf, crbsav->c_buf, bufsz);
					crbsav->c_nb = bufsz;
					/* reference is taken below */
					crbsav->c_refcnt = 0;
					crbsav->c_invalid = 0;
					ASSERT((bufmof & DEV_BMASK) == 0);
					crbsav->c_mof = bufmof;
				} else {
					/* no data buffer; carry on without */
					kmem_free(crbsav, sizeof (crb_t));
					crbsav = NULL;
				}
			}
		}

		/*
		 * remove from list
		 */
		me = melist;
		melist = melist->me_hash;
		me->me_flags &= ~ME_LIST;
		/* attach the crb (if any) and account for the reference */
		me->me_crb = crbsav;
		if (crbsav) {
			crbsav->c_refcnt++;
		}
		crbsav = NULL;

		ASSERT(va);
		ldl_write(ul, va, vamof, me); /* add to on-disk log */
		/*
		 * Log unit is in error: drop the crb reference and the
		 * entry, then move on to the next one.
		 */
		if (ul->un_flags & LDL_ERROR) {
			CRB_RELE(me);
			kmem_cache_free(mapentry_cache, me);
			continue;
		}
		ASSERT(((mtm->mtm_debug & MT_LOG_WRITE_CHECK) == 0) ||
			map_check_ldl_write(ul, va, vamof, me));

		/*
		 * put on hash
		 * (head of the hash chain, tail of the ordered list)
		 */
		mutex_enter(&mtm->mtm_mutex);
		me->me_hash = *savmep;
		*savmep = me;
		me->me_next = (mapentry_t *)mtm;
		me->me_prev = mtm->mtm_prev;
		mtm->mtm_prev->me_next = me;
		mtm->mtm_prev = me;
		me->me_flags |= ME_HASH;
		me->me_tid = tid;
		me->me_age = mtm->mtm_age++;
		mtm->mtm_nme++;
		mtm->mtm_nmet++;
		mutex_exit(&mtm->mtm_mutex);
	}

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
		map_check_linkage(mtm));
	mutex_exit(&ul->un_log_mutex);
}

/*
 * Free up any cancelled deltas.
 *
 * mtm        - log map the entries belong to
 * cancelhead - head of the cancel list (linked through me_cancel);
 *              drained by this routine
 *
 * The first pass runs with only mtm_mutex held.  If an entry flagged
 * ME_AGE is met, the pass is restarted with mtm_rwlock held as writer so
 * that the roll/read which owns the entry has finished.
 *
 * If the map is flagged MTM_CANCELED, the flag is cleared and the list is
 * left untouched.
 */
void
logmap_free_cancel(mt_map_t *mtm, mapentry_t **cancelhead)
{
	int		dolock	= 0;
	mapentry_t	*me;
	mapentry_t	**mep;

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
		map_check_linkage(mtm));

again:
	if (dolock)
		rw_enter(&mtm->mtm_rwlock, RW_WRITER);

	/*
	 * At EOT, cancel the indicated deltas
	 */
	mutex_enter(&mtm->mtm_mutex);
	if (mtm->mtm_flags & MTM_CANCELED) {
		mtm->mtm_flags &= ~MTM_CANCELED;
		ASSERT(dolock == 0);
		mutex_exit(&mtm->mtm_mutex);
		/*
		 * The flag is not expected to be seen on the retry pass
		 * (see the ASSERT above), but if it ever is, do not return
		 * with the rwlock still held as writer.
		 */
		if (dolock)
			rw_exit(&mtm->mtm_rwlock);
		return;
	}

	while ((me = *cancelhead) != NULL) {
		/*
		 * roll forward or read collision; wait and try again
		 */
		if (me->me_flags & ME_AGE) {
			ASSERT(dolock == 0);
			mutex_exit(&mtm->mtm_mutex);
			dolock = 1;
			goto again;
		}
		/*
		 * remove from cancel list
		 */
		*cancelhead = me->me_cancel;
		me->me_cancel = NULL;
		me->me_flags &= ~(ME_CANCEL);

		/*
		 * logmap_remove_roll handles ME_ROLL entries later
		 *	we leave them around for logmap_iscancel
		 *	XXX is this necessary?
		 */
		if (me->me_flags & ME_ROLL)
			continue;

		/*
		 * remove from hash (if necessary)
		 */
		if (me->me_flags & ME_HASH) {
			mep = MAP_HASH(me->me_mof, mtm);
			while (*mep) {
				if (*mep == me) {
					/* also unlink from the ordered list */
					*mep = me->me_hash;
					me->me_next->me_prev = me->me_prev;
					me->me_prev->me_next = me->me_next;
					me->me_flags &= ~(ME_HASH);
					/* ME_USER entries are not counted */
					if (!(me->me_flags & ME_USER)) {
						mtm->mtm_nme--;
					}
					break;
				} else
					mep = &(*mep)->me_hash;
			}
		}
		/*
		 * drop the roll buffer reference and free the entry
		 */
		CRB_RELE(me);
		kmem_cache_free(mapentry_cache, me);
	}
	mutex_exit(&mtm->mtm_mutex);
	if (dolock)
		rw_exit(&mtm->mtm_rwlock);

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
		map_check_linkage(mtm));
}


/*
 * Write a commit record into the log if the logmap is dirty.
 *
 * ul  - log unit; the caller must hold ul->un_log_mutex
 * tid - transaction id handed to logmap_abort() should the write fail
 *
 * The commit record itself carries mtm->mtm_tid in its me_mof field.
 * On success the map's dirty, nmet and cfrags counters are reset; on
 * LDL_ERROR the transaction's mapentries are aborted instead.  Nothing
 * is done when the map is clean.
 */
void
logmap_commit(ml_unit_t *ul, uint32_t tid)
{
	mt_map_t	*mtm	= ul->un_logmap;
	mapentry_t	commit_me;

	ASSERT(MUTEX_HELD(&ul->un_log_mutex));

	/* nothing to commit */
	if (!mtm->mtm_dirty)
		return;

	/*
	 * async'ly write a commit rec into the log
	 */
	commit_me.me_mof = mtm->mtm_tid;
	commit_me.me_dt = DT_COMMIT;
	commit_me.me_nb = 0;
	commit_me.me_hash = NULL;
	logmap_wait_space(mtm, ul, &commit_me);
	ldl_write(ul, NULL, (offset_t)0, &commit_me);
	ldl_round_commit(ul);

	/*
	 * reset the per-transaction counters, unless the log errored,
	 * in which case the transaction is aborted
	 */
	if ((ul->un_flags & LDL_ERROR) == 0) {
		mtm->mtm_dirty = 0;
		mtm->mtm_nmet = 0;
		mtm->mtm_cfrags = 0;
	} else {
		logmap_abort(ul, tid);
	}

	/* push commit */
	ldl_push_commit(ul);
}

/*
 * Tell the log where its head is, based on the oldest mapentry in the
 * logmap that has a real log offset.
 *
 * mtm - log map to inspect
 * ul  - log unit whose head is set
 *
 * When no such entry exists, ldl_sethead() is called with lof == -1 and
 * a tid of 0, and the map's age counter is reset.
 */
void
logmap_sethead(mt_map_t *mtm, ml_unit_t *ul)
{
	off_t		lof;
	/*
	 * Initialized so that a defined value is handed to ldl_sethead()
	 * on the empty-map path, where no entry supplies a tid.
	 */
	uint32_t	tid = 0;
	mapentry_t	*me;

	/*
	 * move the head forward so the log knows how full it is
	 * Make sure to skip any mapentry whose me_lof is 0, these
	 * are just place holders for DT_CANCELED freed user blocks
	 * for the current moby.
	 */
	mutex_enter(&ul->un_log_mutex);
	mutex_enter(&mtm->mtm_mutex);
	me = mtm->mtm_next;
	while (me != (mapentry_t *)mtm && me->me_lof == 0) {
		me = me->me_next;
	}

	if (me == (mapentry_t *)mtm)
		lof = -1;
	else {
		lof = me->me_lof;
		tid = me->me_tid;
	}
	mutex_exit(&mtm->mtm_mutex);
	ldl_sethead(ul, lof, tid);
	if (lof == -1)
		mtm->mtm_age = 0;
	mutex_exit(&ul->un_log_mutex);
}

/*
 * Set the tail of the log after a logmap_abort.
 *
 * mtm - log map to inspect
 * ul  - log unit whose tail is set
 *
 * With an empty logmap, ldl_settail() is called with lof == -1 and a
 * size of 0; otherwise the tail is placed at the end of the last commit
 * as recorded in mtm_tail_lof / mtm_tail_nb.
 */
void
logmap_settail(mt_map_t *mtm, ml_unit_t *ul)
{
	off_t		lof;
	/*
	 * Initialized so that a defined value is handed to ldl_settail()
	 * on the empty-map path, where no commit supplies a size.
	 */
	size_t		nb = 0;

	/*
	 * set the tail after the logmap_abort
	 */
	mutex_enter(&ul->un_log_mutex);
	mutex_enter(&mtm->mtm_mutex);
	if (mtm->mtm_prev == (mapentry_t *)mtm)
		lof = -1;
	else {
		/*
		 * set the tail to the end of the last commit
		 */
		lof = mtm->mtm_tail_lof;
		nb = mtm->mtm_tail_nb;
	}
	mutex_exit(&mtm->mtm_mutex);
	ldl_settail(ul, lof, nb);
	mutex_exit(&ul->un_log_mutex);
}

/*
 * When resetting a device, roll the log until every delta has been
 * rolled forward.
 *
 * Entries belonging to the current transaction (mtm_tid) or to the
 * committing one (mtm_committid) do not count as pending unless they
 * are flagged ME_ROLL.  Returns immediately, without writing the
 * summary info, if the log unit is flagged LDL_ERROR or LDL_NOROLL.
 */
void
logmap_roll_dev(ml_unit_t *ul)
{
	mt_map_t	*mtm	= ul->un_logmap;
	ufsvfs_t	*ufsvfsp = ul->un_ufsvfs;
	mapentry_t	*me;

	for (;;) {
		ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
			map_check_linkage(mtm));
		if (ul->un_flags & (LDL_ERROR|LDL_NOROLL))
			return;

		/*
		 * look for deltas: walk past everything that is not
		 * being rolled and belongs to the current or the
		 * committing transaction
		 */
		mutex_enter(&mtm->mtm_mutex);
		me = mtm->mtm_next;
		while (me != (mapentry_t *)mtm &&
		    (me->me_flags & ME_ROLL) == 0 &&
		    (me->me_tid == mtm->mtm_tid ||
		    me->me_tid == mtm->mtm_committid)) {
			me = me->me_next;
		}

		/*
		 * no more deltas; leave the loop with mtm_mutex held
		 */
		if (me == (mapentry_t *)mtm)
			break;

		/*
		 * found a delta; kick the roll thread
		 * but only if the thread is running... (jmh)
		 */
		mutex_exit(&mtm->mtm_mutex);
		logmap_forceroll(mtm);
	}

	mutex_exit(&mtm->mtm_mutex);
	(void) ufs_putsummaryinfo(ul->un_dev, ufsvfsp, ufsvfsp->vfs_fs);

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
		map_check_linkage(mtm));
}

/*
 * Create one DT_CANCEL mapentry covering [mof, mof + nb) and place it in
 * the logmap hash and on the map's cancel list.
 *
 * ul       - log unit
 * mof      - offset on the master device of the cancelled range
 * nb       - length of the cancelled range
 * metadata - non-zero if the range is metadata, in which case the
 *            cancel record is also written to the log
 *
 * If writing the record leaves the log unit flagged LDL_ERROR, the
 * entry is freed and nothing is added to the map.
 */
static void
logmap_cancel_delta(ml_unit_t *ul, offset_t mof, int32_t nb, int metadata)
{
	mt_map_t	*mtm	= ul->un_logmap;
	mapentry_t	*cancel;
	mapentry_t	**chain;
	int		partial;

	/*
	 * map has been referenced and is dirty
	 */
	mtm->mtm_ref = 1;
	mtm->mtm_dirty++;

	/*
	 * get a zeroed mapentry and fill in the cancel record
	 */
	cancel = kmem_cache_alloc(mapentry_cache, KM_SLEEP);
	bzero(cancel, sizeof (mapentry_t));
	cancel->me_mof = mof;
	cancel->me_nb = nb;
	cancel->me_dt = DT_CANCEL;
	cancel->me_tid = mtm->mtm_tid;
	cancel->me_hash = NULL;

	/*
	 * Write delta to log if this delta is for metadata.  If this is not
	 * metadata it is user data and we are just putting a cancel
	 * mapentry into the hash to cancel a user block deletion
	 * in which we do not want the block to be allocated
	 * within this moby.  This cancel entry will prevent the block from
	 * being allocated within the moby and prevent user data corruption
	 * if we happen to crash before this moby is committed.
	 */
	mutex_enter(&ul->un_log_mutex);
	if (metadata) {
		logmap_wait_space(mtm, ul, cancel);
		ldl_write(ul, NULL, (offset_t)0, cancel);
		if (ul->un_flags & LDL_ERROR) {
			kmem_cache_free(mapentry_cache, cancel);
			mutex_exit(&ul->un_log_mutex);
			return;
		}
	}

	/*
	 * put in hash (chain head), on the ordered list (tail) and on
	 * the cancel list
	 */
	chain = MAP_HASH(mof, mtm);
	mutex_enter(&mtm->mtm_mutex);
	cancel->me_age = mtm->mtm_age++;
	cancel->me_hash = *chain;
	*chain = cancel;
	cancel->me_next = (mapentry_t *)mtm;
	cancel->me_prev = mtm->mtm_prev;
	mtm->mtm_prev->me_next = cancel;
	mtm->mtm_prev = cancel;
	cancel->me_cancel = mtm->mtm_cancel;
	mtm->mtm_cancel = cancel;

	if (metadata) {
		/* metadata entries are counted in the map totals */
		mtm->mtm_nme++;
		mtm->mtm_nmet++;
		cancel->me_flags |= (ME_HASH|ME_CANCEL);
	} else {
		/*
		 * user data: flagged ME_USER and not counted; account for
		 * any partial-block remainder in the cancelled-frag total
		 */
		cancel->me_flags = ME_USER;
		cancel->me_flags |= (ME_HASH|ME_CANCEL);
		partial = blkoff(ul->un_ufsvfs->vfs_fs, nb);
		if (partial) {
			mtm->mtm_cfrags += numfrags(ul->un_ufsvfs->vfs_fs,
				partial);
		}
	}
	mutex_exit(&mtm->mtm_mutex);

	mutex_exit(&ul->un_log_mutex);
}

/*
 * Cancel entries in a logmap (entries are freed at EOT).
 *
 * ul       - log unit
 * mof      - offset on the master device of the range to cancel
 * nb       - length of the range
 * metadata - non-zero if the range is metadata
 *
 * The range is processed in pieces that never cross a MAPBLOCKSIZE
 * boundary; one cancel record is produced per piece.
 */
void
logmap_cancel(ml_unit_t *ul, offset_t mof, off_t nb, int metadata)
{
	mt_map_t	*mtm	= ul->un_logmap;
	mapentry_t	**mep;
	mapentry_t	*me;
	int32_t		hnb	= 0;

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
		map_check_linkage(mtm));

	while (nb) {
		/* clip this piece to the end of the map block or range */
		hnb = MAPBLOCKSIZE - (mof & MAPBLOCKOFF);
		if (hnb > nb)
			hnb = nb;

		/*
		 * Find overlapping metadata entries.  Don't search through
		 * the hash chains if this is user data because it is only
		 * possible to have overlapping map entries for metadata,
		 * and the search can become expensive for large files.
		 */
		if (metadata) {
			mep = MAP_HASH(mof, mtm);
			mutex_enter(&mtm->mtm_mutex);
			me = *mep;
			while (me != NULL) {
				if (DATAoverlapME(mof, hnb, me)) {
					ASSERT(MEwithinDATA(me, mof, hnb));

					/*
					 * not yet cancelled: put on the
					 * cancel list and invalidate any
					 * cached roll buffer
					 */
					if (!(me->me_flags & ME_CANCEL)) {
						me->me_cancel = mtm->mtm_cancel;
						mtm->mtm_cancel = me;
						me->me_flags |= ME_CANCEL;
						if (me->me_crb != NULL)
							me->me_crb->c_invalid =
							    1;
					}
				}
				me = me->me_hash;
			}
			mutex_exit(&mtm->mtm_mutex);
		}

		/*
		 * put a cancel record into the log
		 */
		logmap_cancel_delta(ul, mof, hnb, metadata);

		nb -= hnb;
		mof += hnb;
	}

	ASSERT(((mtm->mtm_debug & MT_CHECK_MAP) == 0) ||
		map_check_linkage(mtm));
}

/*
 * Check for overlap with a cancel delta.
 *
 * mtm - log map to search
 * mof - offset on the master device of the range to test
 * nb  - length of the range
 *
 * Returns 1 if any part of the range overlaps a mapentry that is either
 * flagged ME_ROLL or of type DT_CANCEL, 0 otherwise.  The range is
 * examined in pieces that never cross a MAPBLOCKSIZE boundary.
 */
int
logmap_iscancel(mt_map_t *mtm, offset_t mof, off_t nb)
{
	mapentry_t	**mep;
	mapentry_t	*me;
	off_t		hnb;
	int		overlap = 0;

	mutex_enter(&mtm->mtm_mutex);
	while (nb && !overlap) {
		/* clip this piece to the end of the map block or range */
		hnb = MAPBLOCKSIZE - (mof & MAPBLOCKOFF);
		if (hnb > nb)
			hnb = nb;

		/*
		 * search the chain for a rolling or cancel entry that
		 * overlaps this piece
		 */
		mep = MAP_HASH(mof, mtm);
		for (me = *mep; me != NULL; me = me->me_hash) {
			if (((me->me_flags & ME_ROLL) != 0 ||
			    me->me_dt == DT_CANCEL) &&
			    DATAoverlapME(mof, hnb, me)) {
				overlap = 1;
				break;
			}
		}

		nb -= hnb;
		mof += hnb;
	}
	mutex_exit(&mtm->mtm_mutex);

	return (overlap);
}

/*
 * Process one delta header read from the log during a log scan.
 *
 * ul  - log unit being scanned
 * dp  - delta header just read
 * lof - log offset associated with the delta
 * nbp - out: number of bytes to skip to reach the next delta header
 *
 * Returns 0 on success.  Returns EINVAL, after flagging the log unit in
 * error via ldl_seterror(), if the header fails validation; *nbp is not
 * set in that case.
 *
 * A DT_COMMIT record ends the current transaction (if the map is dirty)
 * and records the tail position.  Any other delta becomes a mapentry in
 * the logmap; DT_CANCEL deltas are additionally put on the cancel list.
 */
static int
logmap_logscan_add(ml_unit_t *ul, struct delta *dp, off_t lof, size_t *nbp)
{
	mapentry_t	*me;
	int		error;
	mt_map_t	*mtm	= ul->un_logmap;

	/*
	 * verify delta header; failure == mediafail
	 */
	error = 0;
	/* delta type */
	if ((dp->d_typ <= DT_NONE) || (dp->d_typ >= DT_MAX))
		error = EINVAL;
	if (dp->d_typ == DT_COMMIT) {
		if (dp->d_nb != INT32_C(0) && dp->d_nb != INT32_C(-1))
			error = EINVAL;
	} else {
		/* length of delta */
		if ((dp->d_nb < INT32_C(0)) ||
		    (dp->d_nb > INT32_C(MAPBLOCKSIZE)))
			error = EINVAL;

		/* offset on master device */
		if (dp->d_mof < INT64_C(0))
			error = EINVAL;
	}

	if (error) {
		ldl_seterror(ul, "Error processing ufs log data during scan");
		return (error);
	}

	/*
	 * process commit record
	 */
	if (dp->d_typ == DT_COMMIT) {
		if (mtm->mtm_dirty) {
			ASSERT(dp->d_nb == INT32_C(0));
			logmap_free_cancel(mtm, &mtm->mtm_cancel);
			mtm->mtm_dirty = 0;
			mtm->mtm_nmet = 0;
			/* start a new transaction id */
			mtm->mtm_tid++;
			mtm->mtm_committid = mtm->mtm_tid;
			ASSERT(((mtm->mtm_debug & MT_SCAN) == 0) ||
				logmap_logscan_commit_debug(lof, mtm));
		}
		/*
		 * return #bytes to next sector (next delta header)
		 */
		*nbp = ldl_logscan_nbcommit(lof);
		mtm->mtm_tail_lof = lof;
		mtm->mtm_tail_nb = *nbp;
		return (0);
	}

	/*
	 * add delta to logmap
	 */
	me = kmem_cache_alloc(mapentry_cache, KM_SLEEP);
	bzero(me, sizeof (mapentry_t));
	me->me_lof = lof;
	me->me_mof = dp->d_mof;
	me->me_nb = dp->d_nb;
	me->me_tid = mtm->mtm_tid;
	me->me_dt = dp->d_typ;
	me->me_hash = NULL;
	me->me_flags = (ME_LIST | ME_SCAN);
	logmap_add(ul, NULL, 0, me);
	switch (dp->d_typ) {
	case DT_CANCEL:
		/*
		 * logmap_add() frees the entry instead of hashing it when
		 * the log unit is flagged LDL_ERROR; the entry must not be
		 * touched or linked onto the cancel list in that case.
		 */
		if (ul->un_flags & LDL_ERROR)
			break;
		me->me_flags |= ME_CANCEL;
		me->me_cancel = mtm->mtm_cancel;
		mtm->mtm_cancel = me;
		break;
	default:
		ASSERT(((mtm->mtm_debug & MT_SCAN) == 0) ||
			logmap_logscan_add_debug(dp, mtm));
		break;
	}

	/*
	 * return #bytes till next delta header
	 * (cancel and abzero deltas carry no data in the log)
	 */
	if ((dp->d_typ == DT_CANCEL) || (dp->d_typ == DT_ABZERO))
		*nbp = 0;
	else
		*nbp = dp->d_nb;
	return (0);
}

/*
 * Rebuild the logmap by scanning the on-disk log from head to tail.
 *
 * The logmap is emptied and its counters reset, every delta header
 * between ul->un_head_lof and ul->un_tail_lof is fed to
 * logmap_logscan_add(), and whatever remains of the last, uncommitted
 * transaction is aborted.  The scan stops early if a read or a delta
 * header fails.
 */
void
logmap_logscan(ml_unit_t *ul)
{
	mt_map_t	*logmap	= ul->un_logmap;
	struct delta	hdr;
	off_t		lof;
	size_t		skip;

	ASSERT(ul->un_deltamap->mtm_next == (mapentry_t *)ul->un_deltamap);

	/*
	 * prepare the log for a logscan
	 */
	ldl_logscan_begin(ul);

	/*
	 * prepare the logmap for a logscan
	 */
	(void) map_free_entries(logmap);
	logmap->mtm_tid = 0;
	logmap->mtm_committid = UINT32_C(0);
	logmap->mtm_age = 0;
	logmap->mtm_dirty = 0;
	logmap->mtm_ref = 0;

	/*
	 * while not at end of log
	 *	read delta header
	 *	add to logmap
	 *	seek to beginning of next delta (skipping the data)
	 * Any failing step ends the scan.
	 */
	for (lof = ul->un_head_lof; lof != ul->un_tail_lof; ) {
		if (ldl_logscan_read(ul, &lof, sizeof (hdr), (caddr_t)&hdr) ||
		    logmap_logscan_add(ul, &hdr, lof, &skip) ||
		    ldl_logscan_read(ul, &lof, skip, NULL))
			break;
	}

	/*
	 * remove the last partial transaction from the logmap
	 */
	logmap_abort(ul, logmap->mtm_tid);

	ldl_logscan_end(ul);
}

void
_init_map(void)
{
	/*
	 * Initialise the mapentry cache. No constructor or deconstructor
	 * is needed. Also no reclaim function is supplied as reclaiming
	 * current entries is not possible.
	 */
	mapentry_cache = kmem_cache_create("lufs_mapentry_cache",
	    sizeof (mapentry_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
}

/*
 * Special case when we replace an old map entry which carries quota
 * information with a newer entry which does not.
 * In that case the push function would not be called to clean up the
 * dquot structure. This would be found later by invalidatedq() causing
 * a panic when the filesystem is unmounted.
 * We clean up the dquot manually before replacing the map entry.
 *
 * me - mapentry whose me_arg points at the dquot to clean up
 *
 * vfs_dqrwlock (as reader) and dq_lock are taken here only if they are
 * not already held, and only the locks taken here are released.
 */
void
handle_dquot(mapentry_t *me)
{
	struct dquot	*dqp = (struct dquot *)me->me_arg;
	int		took_rwlock;
	int		took_mutex;

	/*
	 * We need vfs_dqrwlock to call dqput()
	 */
	took_rwlock = (!RW_LOCK_HELD(&dqp->dq_ufsvfsp->vfs_dqrwlock));
	if (took_rwlock)
		rw_enter(&dqp->dq_ufsvfsp->vfs_dqrwlock, RW_READER);

	took_mutex = (!MUTEX_HELD(&dqp->dq_lock));
	if (took_mutex)
		mutex_enter(&dqp->dq_lock);

	/*
	 * Only clean up if the dquot is referenced
	 */
	if (dqp->dq_cnt != 0) {
		dqp->dq_flags &= ~(DQ_MOD|DQ_TRANS);
		dqput(dqp);
	}

	/* release, in reverse order, whatever was acquired above */
	if (took_mutex)
		mutex_exit(&dqp->dq_lock);

	if (took_rwlock)
		rw_exit(&dqp->dq_ufsvfsp->vfs_dqrwlock);
}