/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/systm.h>
#include <sys/types.h>
#include <sys/vnode.h>
#include <sys/errno.h>
#include <sys/sysmacros.h>
#include <sys/debug.h>
#include <sys/kmem.h>
#include <sys/conf.h>
#include <sys/proc.h>
#include <sys/cmn_err.h>
#include <sys/fssnap_if.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_filio.h>
#include <sys/fs/ufs_log.h>
#include <sys/fs/ufs_bio.h>
#include <sys/atomic.h>

extern int		maxphys;
extern uint_t		bypass_snapshot_throttle_key;

extern struct kmem_cache	*lufs_sv;
extern struct kmem_cache	*lufs_bp;

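/*
 * Acquire exclusive access to a log buf.  If the buf is marked in
 * error, record a read or write error against the log.
 */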
static void
makebusy(ml_unit_t *ul, buf_t *bp)
{
	sema_p(&bp->b_sem);
	if ((bp->b_flags & B_ERROR) == 0)
		return;
	if (bp->b_flags & B_READ)
		ldl_seterror(ul, "Error reading ufs log");
	else
		ldl_seterror(ul, "Error writing ufs log");
}

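/*
 * b_iodone routine for log bufs.  Completed writes release the buf's
 * busy semaphore; completed reads wake the thread waiting on b_io.
 */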
static int
logdone(buf_t *bp)
{
	bp->b_flags |= B_DONE;

	if (bp->b_flags & B_WRITE)
		sema_v(&bp->b_sem);
	else
		/* wakeup the thread waiting on this buf */
		sema_v(&bp->b_io);
	return (0);
}

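/*
 * Completion routine for the bufs cloned by ldl_strategy().  Records
 * any error and, when the last cloned buf completes, frees the save
 * structure and biodone()s the original buf.
 */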
static int
ldl_strategy_done(buf_t *cb)
{
	lufs_save_t	*sv;
	lufs_buf_t	*lbp;
	buf_t		*bp;

	ASSERT(SEMA_HELD(&cb->b_sem));
	ASSERT((cb->b_flags & B_DONE) == 0);

	/*
	 * Compute address of the ``save'' struct
	 */
	lbp = (lufs_buf_t *)cb;
	sv = (lufs_save_t *)lbp->lb_ptr;

	if (cb->b_flags & B_ERROR)
		sv->sv_error = 1;

	/*
	 * If other cloned requests are still outstanding, just free
	 * this cloned buf header.  Otherwise this is the last request:
	 * release the resources and ``done'' the original buffer header.
	 */
	if (atomic_add_long_nv(&sv->sv_nb_left, -cb->b_bcount)) {
		kmem_cache_free(lufs_bp, lbp);
		return (1);
	}
	/* Propagate any errors back to the original buffer header */
	bp = sv->sv_bp;
	if (sv->sv_error)
		bp->b_flags |= B_ERROR;
	kmem_cache_free(lufs_bp, lbp);
	kmem_cache_free(lufs_sv, sv);

	biodone(bp);
	return (0);
}

/*
 * Map the log logical block number to a physical disk block number
 */
static int
map_frag(
	ml_unit_t	*ul,
	daddr_t		lblkno,
	size_t		bcount,
	daddr_t		*pblkno,
	size_t		*pbcount)
{
	ic_extent_t	*ext = ul->un_ebp->ic_extents;
	uint32_t	e = ul->un_ebp->ic_nextents;
	uint32_t	s = 0;
	uint32_t	i = e >> 1;
	uint32_t	lasti = i;
	uint32_t	bno_off;

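	/*
	 * binary search the incore extent list for the extent
	 * containing lblkno
	 */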
again:
	if (ext[i].ic_lbno <= lblkno) {
		if ((ext[i].ic_lbno + ext[i].ic_nbno) > lblkno) {
			/* FOUND IT */
			bno_off = lblkno - (uint32_t)ext[i].ic_lbno;
			*pbcount = MIN(bcount, dbtob(ext[i].ic_nbno - bno_off));
			*pblkno = ext[i].ic_pbno + bno_off;
			return (0);
		} else
			s = i;
	} else
		e = i;
	i = s + ((e - s) >> 1);

	if (i == lasti) {
		*pbcount = bcount;
		return (ENOENT);
	}
	lasti = i;

	goto again;
}

/*
 * The log is a set of extents (typically only one, but there may be
 * more if the disk was close to full when the log was created), so
 * logical offsets into the log have to be translated into their real
 * device locations before calling the device's strategy routine.  The
 * translation may result in several IO requests if the request spans
 * extents.
 */
void
ldl_strategy(ml_unit_t *ul, buf_t *pb)
{
	lufs_save_t	*sv;
	lufs_buf_t	*lbp;
	buf_t		*cb;
	ufsvfs_t	*ufsvfsp = ul->un_ufsvfs;
	daddr_t		lblkno, pblkno;
	size_t		nb_left, pbcount;
	off_t		offset;
	dev_t		dev	= ul->un_dev;
	int		error;
	int		read = pb->b_flags & B_READ;

	/*
	 * Allocate and initialise the save structure.
	 */
	sv = kmem_cache_alloc(lufs_sv, KM_SLEEP);
	sv->sv_error = 0;
	sv->sv_bp = pb;
	nb_left = pb->b_bcount;
	sv->sv_nb_left = nb_left;

	lblkno = pb->b_blkno;
	offset = 0;

	do {
		error = map_frag(ul, lblkno, nb_left, &pblkno, &pbcount);

		lbp = kmem_cache_alloc(lufs_bp, KM_SLEEP);
		bioinit(&lbp->lb_buf);
		lbp->lb_ptr = sv;

		cb = bioclone(pb, offset, pbcount, dev,
		    pblkno, ldl_strategy_done, &lbp->lb_buf, KM_SLEEP);

		offset += pbcount;
		lblkno += btodb(pbcount);
		nb_left -= pbcount;

		if (error) {
			cb->b_flags |= B_ERROR;
			cb->b_resid = cb->b_bcount;
			biodone(cb);
		} else {
			if (read) {
				logstats.ls_ldlreads.value.ui64++;
				ufsvfsp->vfs_iotstamp = lbolt;
				lwp_stat_update(LWP_STAT_INBLK, 1);
			} else {
				logstats.ls_ldlwrites.value.ui64++;
				lwp_stat_update(LWP_STAT_OUBLK, 1);
			}

			/*
			 * write through the snapshot driver if necessary
			 * We do not want this write to be throttled because
			 * we are holding the un_log mutex here. If we
			 * are throttled in fssnap_translate, the fssnap_taskq
			 * thread which can wake us up can get blocked on
			 * the un_log mutex resulting in a deadlock.
			 */
			if (ufsvfsp->vfs_snapshot) {
				(void) tsd_set(bypass_snapshot_throttle_key,
				    (void *)1);
				fssnap_strategy(&ufsvfsp->vfs_snapshot, cb);

				(void) tsd_set(bypass_snapshot_throttle_key,
				    (void *)0);
			} else {
				(void) bdev_strategy(cb);
			}
		}

	} while (nb_left);
}

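/*
 * Start an asynchronous write of a log buf; logdone() releases the
 * buf when the IO completes.
 */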
static void
writelog(ml_unit_t *ul, buf_t *bp)
{
	ASSERT(SEMA_HELD(&bp->b_sem));

	/*
	 * This is really a B_ASYNC write but we want Presto to
	 * cache this write.  The iodone routine, logdone, processes
	 * the buf correctly.
	 */
	bp->b_flags = B_WRITE;
	bp->b_edev = ul->un_dev;
	bp->b_iodone = logdone;

	/*
	 * return EIO for every IO if in hard error state
	 */
	if (ul->un_flags & LDL_ERROR) {
		bp->b_flags |= B_ERROR;
		bp->b_error = EIO;
		biodone(bp);
		return;
	}

	ldl_strategy(ul, bp);
}

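/*
 * Read a log buf from the log device and wait for the IO to complete.
 */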
static void
readlog(ml_unit_t *ul, buf_t *bp)
{
	ASSERT(SEMA_HELD(&bp->b_sem));
	ASSERT(bp->b_bcount);

	bp->b_flags = B_READ;
	bp->b_edev = ul->un_dev;
	bp->b_iodone = logdone;

	/* all IO returns errors when in error state */
	if (ul->un_flags & LDL_ERROR) {
		bp->b_flags |= B_ERROR;
		bp->b_error = EIO;
		biodone(bp);
		(void) trans_wait(bp);
		return;
	}

	ldl_strategy(ul, bp);

	if (trans_wait(bp))
		ldl_seterror(ul, "Error reading ufs log");
}

/*
 * NOTE: writers are single threaded thru the log layer.
 * This means we can safely reference and change the cb and bp fields
 * that ldl_read does not reference w/o holding the cb_rwlock or
 * the bp makebusy lock.
 */
static void
push_dirty_bp(ml_unit_t *ul, buf_t *bp)
{
	buf_t		*newbp;
	cirbuf_t	*cb		= &ul->un_wrbuf;

	ASSERT(bp == cb->cb_bp && bp == cb->cb_dirty);
	ASSERT((bp->b_bcount & (DEV_BSIZE-1)) == 0);

	/*
	 * async write the buf
	 */
	writelog(ul, bp);

	/*
	 * no longer filling any buf
	 */
	cb->cb_dirty = NULL;

	/*
	 * no extra buffer space; all done
	 */
	if (bp->b_bcount == bp->b_bufsize)
		return;

	/*
	 * give extra buffer space to a new bp
	 * 	try to take buf off of free list
	 */
	if ((newbp = cb->cb_free) != NULL) {
		cb->cb_free = newbp->b_forw;
	} else {
		newbp = kmem_zalloc(sizeof (buf_t), KM_SLEEP);
		sema_init(&newbp->b_sem, 1, NULL, SEMA_DEFAULT, NULL);
		sema_init(&newbp->b_io, 0, NULL, SEMA_DEFAULT, NULL);
	}
	newbp->b_flags = 0;
	newbp->b_bcount = 0;
	newbp->b_file = NULL;
	newbp->b_offset = -1;
	newbp->b_bufsize = bp->b_bufsize - bp->b_bcount;
	newbp->b_un.b_addr = bp->b_un.b_addr + bp->b_bcount;
	bp->b_bufsize = bp->b_bcount;

	/*
	 * lock out readers and put new buf at LRU position
	 */
	rw_enter(&cb->cb_rwlock, RW_WRITER);
	newbp->b_forw = bp->b_forw;
	newbp->b_back = bp;
	bp->b_forw->b_back = newbp;
	bp->b_forw = newbp;
	rw_exit(&cb->cb_rwlock);
}

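/*
 * Invalidate any bufs in the circular buffer that overlap the log
 * offset range [lof, lof + nb).
 */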
static void
inval_range(ml_unit_t *ul, cirbuf_t *cb, off_t lof, off_t nb)
{
	buf_t		*bp;
	off_t		elof	= lof + nb;
	off_t		buflof;
	off_t		bufelof;

	/*
	 * discard all bufs that overlap the range (lof, lof + nb)
	 */
	rw_enter(&cb->cb_rwlock, RW_WRITER);
	bp = cb->cb_bp;
	do {
		if (bp == cb->cb_dirty || bp->b_bcount == 0) {
			bp = bp->b_forw;
			continue;
		}
		buflof = dbtob(bp->b_blkno);
		bufelof = buflof + bp->b_bcount;
		if ((buflof < lof && bufelof <= lof) ||
		    (buflof >= elof && bufelof > elof)) {
			bp = bp->b_forw;
			continue;
		}
		makebusy(ul, bp);
		bp->b_flags = 0;
		bp->b_bcount = 0;
		sema_v(&bp->b_sem);
		bp = bp->b_forw;
	} while (bp != cb->cb_bp);
	rw_exit(&cb->cb_rwlock);
}

/*
 * NOTE: writers are single threaded thru the log layer.
 * This means we can safely reference and change the cb and bp fields
 * that ldl_read does not reference w/o holding the cb_rwlock or
 * the bp makebusy lock.
 */
static buf_t *
get_write_bp(ml_unit_t *ul)
{
	cirbuf_t	*cb = &ul->un_wrbuf;
	buf_t		*bp;

	/*
	 * cb_dirty is the buffer we are currently filling, if any
	 */
	if ((bp = cb->cb_dirty) != NULL) {
		makebusy(ul, bp);
		return (bp);
	}
	/*
	 * discard any bp that overlaps the current tail since we are
	 * about to overwrite it.
	 */
	inval_range(ul, cb, ul->un_tail_lof, 1);

	/*
	 * steal LRU buf
	 */
	rw_enter(&cb->cb_rwlock, RW_WRITER);
	bp = cb->cb_bp->b_forw;
	makebusy(ul, bp);

	cb->cb_dirty = bp;
	cb->cb_bp = bp;

	bp->b_flags = 0;
	bp->b_bcount = 0;
	bp->b_blkno = btodb(ul->un_tail_lof);
	ASSERT(dbtob(bp->b_blkno) == ul->un_tail_lof);
	rw_exit(&cb->cb_rwlock);

	/*
	 * NOTE:
	 *	1. un_tail_lof never addresses >= un_eol_lof
	 *	2. b_blkno + btodb(b_bufsize) may be > un_eol_lof;
	 *		this case is handled in storebuf
	 */
	return (bp);
}

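/*
 * Allocate the circular write buffer: one contiguous buffer of
 * bufsize bytes initially claimed by a single buf header, plus three
 * spare buf headers on the free list.
 */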
void
alloc_wrbuf(cirbuf_t *cb, size_t bufsize)
{
	int	i;
	buf_t	*bp;

	/*
	 * Clear previous allocation
	 */
	if (cb->cb_nb)
		free_cirbuf(cb);

	bzero(cb, sizeof (*cb));
	rw_init(&cb->cb_rwlock, NULL, RW_DRIVER, NULL);

	rw_enter(&cb->cb_rwlock, RW_WRITER);

	/*
	 * preallocate 3 bp's and put them on the free list.
	 */
	for (i = 0; i < 3; ++i) {
		bp = kmem_zalloc(sizeof (buf_t), KM_SLEEP);
		sema_init(&bp->b_sem, 1, NULL, SEMA_DEFAULT, NULL);
		sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);
		bp->b_offset = -1;
		bp->b_forw = cb->cb_free;
		cb->cb_free = bp;
	}

	cb->cb_va = kmem_alloc(bufsize, KM_SLEEP);
	cb->cb_nb = bufsize;

	/*
	 * first bp claims entire write buffer
	 */
	bp = cb->cb_free;
	cb->cb_free = bp->b_forw;

	bp->b_forw = bp;
	bp->b_back = bp;
	cb->cb_bp = bp;
	bp->b_un.b_addr = cb->cb_va;
	bp->b_bufsize = cb->cb_nb;

	rw_exit(&cb->cb_rwlock);
}

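/*
 * Allocate the circular read buffer: bufsize bytes carved into a
 * circular list of buf headers of at most blksize bytes each.
 */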
void
alloc_rdbuf(cirbuf_t *cb, size_t bufsize, size_t blksize)
{
	caddr_t	va;
	size_t	nb;
	buf_t	*bp;

	/*
	 * Clear previous allocation
	 */
	if (cb->cb_nb)
		free_cirbuf(cb);

	bzero(cb, sizeof (*cb));
	rw_init(&cb->cb_rwlock, NULL, RW_DRIVER, NULL);

	rw_enter(&cb->cb_rwlock, RW_WRITER);

	cb->cb_va = kmem_alloc(bufsize, KM_SLEEP);
	cb->cb_nb = bufsize;

	/*
	 * preallocate N bufs that are hard-sized to blksize
	 *	in other words, the read buffer pool is a linked list
	 *	of statically sized bufs.
	 */
	va = cb->cb_va;
	while ((nb = bufsize) != 0) {
		if (nb > blksize)
			nb = blksize;
		bp = kmem_alloc(sizeof (buf_t), KM_SLEEP);
		bzero(bp, sizeof (buf_t));
		sema_init(&bp->b_sem, 1, NULL, SEMA_DEFAULT, NULL);
		sema_init(&bp->b_io, 0, NULL, SEMA_DEFAULT, NULL);
		bp->b_un.b_addr = va;
		bp->b_bufsize = nb;
		if (cb->cb_bp) {
			bp->b_forw = cb->cb_bp->b_forw;
			bp->b_back = cb->cb_bp;
			cb->cb_bp->b_forw->b_back = bp;
			cb->cb_bp->b_forw = bp;
		} else
			bp->b_forw = bp->b_back = bp;
		cb->cb_bp = bp;
		bufsize -= nb;
		va += nb;
	}

	rw_exit(&cb->cb_rwlock);
}

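/*
 * Free a circular buffer: destroy all buf headers (active and free)
 * and release the underlying memory.
 */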
void
free_cirbuf(cirbuf_t *cb)
{
	buf_t	*bp;

	if (cb->cb_nb == 0)
		return;

	rw_enter(&cb->cb_rwlock, RW_WRITER);
	ASSERT(cb->cb_dirty == NULL);

	/*
	 * free the active bufs
	 */
	while ((bp = cb->cb_bp) != NULL) {
		if (bp == bp->b_forw)
			cb->cb_bp = NULL;
		else
			cb->cb_bp = bp->b_forw;
		bp->b_back->b_forw = bp->b_forw;
		bp->b_forw->b_back = bp->b_back;
		sema_destroy(&bp->b_sem);
		sema_destroy(&bp->b_io);
		kmem_free(bp, sizeof (buf_t));
	}

	/*
	 * free the free bufs
	 */
	while ((bp = cb->cb_free) != NULL) {
		cb->cb_free = bp->b_forw;
		sema_destroy(&bp->b_sem);
		sema_destroy(&bp->b_io);
		kmem_free(bp, sizeof (buf_t));
	}
	kmem_free(cb->cb_va, cb->cb_nb);
	cb->cb_va = NULL;
	cb->cb_nb = 0;
	rw_exit(&cb->cb_rwlock);
	rw_destroy(&cb->cb_rwlock);
}

static int
within_range(off_t lof, daddr_t blkno, ulong_t bcount)
{
	off_t	blof	= dbtob(blkno);

	return ((lof >= blof) && (lof < (blof + bcount)));
}

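/*
 * Search a circular buffer for a buf containing log offset lof and
 * return it busied, or NULL if none is found.
 */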
static buf_t *
find_bp(ml_unit_t *ul, cirbuf_t *cb, off_t lof)
{
	buf_t *bp;

	/*
	 * find a buf that contains the offset lof
	 */
	rw_enter(&cb->cb_rwlock, RW_READER);
	bp = cb->cb_bp;
	do {
		if (bp->b_bcount &&
		    within_range(lof, bp->b_blkno, bp->b_bcount)) {
			makebusy(ul, bp);
			rw_exit(&cb->cb_rwlock);
			return (bp);
		}
		bp = bp->b_forw;
	} while (bp != cb->cb_bp);
	rw_exit(&cb->cb_rwlock);

	return (NULL);
}

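/*
 * Return an exclusive upper bound for a read starting at log offset
 * lof, so that the read does not go past the tail, past end-of-log,
 * or into data still held in the write buffer.
 */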
static off_t
find_read_lof(ml_unit_t *ul, cirbuf_t *cb, off_t lof)
{
	buf_t	*bp, *bpend;
	off_t	rlof;

	/*
	 * we mustn't:
	 *	o read past eol
	 *	o read past the tail
	 *	o read data that may be in the process of being written.
	 */
	rw_enter(&cb->cb_rwlock, RW_READER);
	bpend = bp = cb->cb_bp->b_forw;
	rlof = ul->un_tail_lof;
	do {
		if (bp->b_bcount) {
			rlof = dbtob(bp->b_blkno);
			break;
		}
		bp = bp->b_forw;
	} while (bp != bpend);
	rw_exit(&cb->cb_rwlock);

	if (lof <= rlof)
		/* lof is prior to the range represented by the write buf */
		return (rlof);
	else
		/* lof follows the range represented by the write buf */
		return ((off_t)ul->un_eol_lof);
}

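/*
 * Return a busied buf holding the log data at offset lof.  The data
 * is returned from the incore write or read buffers if possible;
 * otherwise the LRU read buf is stolen and filled from the log device.
 */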
static buf_t *
get_read_bp(ml_unit_t *ul, off_t lof)
{
	cirbuf_t	*cb;
	buf_t		*bp;
	off_t		rlof;

	/*
	 * retrieve as much data as possible from the incore buffers
	 */
	if ((bp = find_bp(ul, &ul->un_wrbuf, lof)) != NULL) {
		logstats.ls_lreadsinmem.value.ui64++;
		return (bp);
	}
	if ((bp = find_bp(ul, &ul->un_rdbuf, lof)) != NULL) {
		logstats.ls_lreadsinmem.value.ui64++;
		return (bp);
	}

	/*
	 * steal the LRU buf
	 */
	cb = &ul->un_rdbuf;
	rw_enter(&cb->cb_rwlock, RW_WRITER);
	bp = cb->cb_bp->b_forw;
	makebusy(ul, bp);
	bp->b_flags = 0;
	bp->b_bcount = 0;
	cb->cb_bp = bp;
	rw_exit(&cb->cb_rwlock);

	/*
	 * don't read past the tail or the end-of-log
	 */
	bp->b_blkno = btodb(lof);
	lof = dbtob(bp->b_blkno);
	rlof = find_read_lof(ul, &ul->un_wrbuf, lof);
	bp->b_bcount = MIN(bp->b_bufsize, rlof - lof);
	readlog(ul, bp);
	return (bp);
}

/*
 * NOTE: writers are single threaded thru the log layer.
 * This means we can safely reference and change the cb and bp fields
 * that ldl_read does not reference w/o holding the cb_rwlock or
 * the bp makebusy lock.
 */
static int
extend_write_bp(ml_unit_t *ul, cirbuf_t *cb, buf_t *bp)
{
	buf_t	*bpforw	= bp->b_forw;

	ASSERT(bp == cb->cb_bp && bp == cb->cb_dirty);

	/*
	 * there is no `next' bp; do nothing
	 */
	if (bpforw == bp)
		return (0);

	/*
	 * buffer space is not adjacent; do nothing
	 */
	if ((bp->b_un.b_addr + bp->b_bufsize) != bpforw->b_un.b_addr)
		return (0);

	/*
	 * locking protocol requires giving up any bp locks before
	 * acquiring cb_rwlock.  This is okay because we hold
	 * un_log_mutex.
	 */
	sema_v(&bp->b_sem);

	/*
	 * lock out ldl_read
	 */
	rw_enter(&cb->cb_rwlock, RW_WRITER);

	/*
	 * wait for any current IO on the next bp to finish, if necessary
	 */
	makebusy(ul, bpforw);

	/*
	 * free the next bp and steal its space
	 */
	bp->b_forw = bpforw->b_forw;
	bpforw->b_forw->b_back = bp;
	bp->b_bufsize += bpforw->b_bufsize;
	sema_v(&bpforw->b_sem);
	bpforw->b_forw = cb->cb_free;
	cb->cb_free = bpforw;
	makebusy(ul, bp);
	rw_exit(&cb->cb_rwlock);

	return (1);
}

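/*
 * Copy up to nb bytes from va into the write buf, filling in a sector
 * trailer at the end of each sector and advancing the log tail.
 * Returns the number of bytes copied, which may be less than nb if
 * the buf filled up or the log wrapped; the caller loops with a fresh
 * write buf until everything is stored.
 */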
static size_t
storebuf(ml_unit_t *ul, buf_t *bp, caddr_t va, size_t nb)
{
	size_t		copy_nb;
	size_t		nb_in_sec;
	sect_trailer_t	*st;
	size_t		nb_left = nb;
	cirbuf_t	*cb	= &ul->un_wrbuf;

again:
	nb_in_sec = NB_LEFT_IN_SECTOR(bp->b_bcount);
	copy_nb = MIN(nb_left, nb_in_sec);

	ASSERT(copy_nb);

	bcopy(va, bp->b_un.b_addr + bp->b_bcount, copy_nb);
	bp->b_bcount += copy_nb;
	va += copy_nb;
	nb_left -= copy_nb;
	ul->un_tail_lof += copy_nb;

	if ((nb_in_sec -= copy_nb) == 0) {
		st = (sect_trailer_t *)(bp->b_un.b_addr + bp->b_bcount);

		st->st_tid = ul->un_logmap->mtm_tid;
		st->st_ident = ul->un_tail_ident++;
		bp->b_bcount += sizeof (sect_trailer_t);
		ul->un_tail_lof += sizeof (sect_trailer_t);
		/*
		 * log wrapped; async write this bp
		 */
		if (ul->un_tail_lof == ul->un_eol_lof) {
			ul->un_tail_lof = ul->un_bol_lof;
			push_dirty_bp(ul, bp);
			return (nb - nb_left);
		}
		/*
		 * out of bp space; get more or async write buf
		 */
		if (bp->b_bcount == bp->b_bufsize) {
			if (!extend_write_bp(ul, cb, bp)) {
				push_dirty_bp(ul, bp);
				return (nb - nb_left);
			}
		}
	}
	if (nb_left)
		goto again;

	sema_v(&bp->b_sem);
	return (nb);
}

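/*
 * Zero the portion of the caller's buffer that is covered by an
 * all-zero (DT_ABZERO) mapentry; there is no log data to read.
 */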
static void
fetchzeroes(caddr_t dst_va, offset_t dst_mof, ulong_t dst_nb, mapentry_t *me)
{
	offset_t	src_mof	= me->me_mof;
	size_t		src_nb	= me->me_nb;

	if (src_mof > dst_mof) {
		ASSERT(src_mof < (dst_mof + dst_nb));
		dst_va += (src_mof - dst_mof);
		dst_nb -= (src_mof - dst_mof);
	} else {
		ASSERT(dst_mof < (src_mof + src_nb));
		src_nb -= (dst_mof - src_mof);
	}

	src_nb = MIN(src_nb, dst_nb);
	ASSERT(src_nb);
	bzero(dst_va, src_nb);
}

/*
 * dst_va == NULL means don't copy anything
 */
static ulong_t
fetchbuf(
	ml_unit_t *ul,
	buf_t *bp,
	caddr_t dst_va,
	size_t dst_nb,
	off_t *dst_lofp)
{
	caddr_t	copy_va;
	size_t	copy_nb;
	size_t	nb_sec;
	off_t	dst_lof		= *dst_lofp;
	ulong_t	sav_dst_nb	= dst_nb;
	ulong_t	src_nb		= bp->b_bcount;
	off_t	src_lof		= dbtob(bp->b_blkno);
	off_t	src_elof	= src_lof + src_nb;
	caddr_t	src_va		= bp->b_un.b_addr;

	/*
	 * copy from bp to dst_va
	 */
	while (dst_nb) {
		/*
		 * compute address within bp
		 */
		copy_va = src_va + (dst_lof - src_lof);

		/*
		 * adjust copy size to amount of data in bp
		 */
		copy_nb = MIN(dst_nb, src_elof - dst_lof);

		/*
		 * adjust copy size to amount of data in sector
		 */
		nb_sec = NB_LEFT_IN_SECTOR(dst_lof);
		copy_nb = MIN(copy_nb, nb_sec);

		/*
		 * dst_va == NULL means don't do copy (see logseek())
		 */
		if (dst_va) {
			bcopy(copy_va, dst_va, copy_nb);
			dst_va += copy_nb;
		}
		dst_lof += copy_nb;
		dst_nb -= copy_nb;
		nb_sec -= copy_nb;

		/*
		 * advance over sector trailer
		 */
		if (nb_sec == 0)
			dst_lof += sizeof (sect_trailer_t);

		/*
		 * exhausted buffer
		 *	return current lof for next read
		 */
		if (dst_lof == src_elof) {
			sema_v(&bp->b_sem);
			if (dst_lof == ul->un_eol_lof)
				dst_lof = ul->un_bol_lof;
			*dst_lofp = dst_lof;
			return (sav_dst_nb - dst_nb);
		}
	}

	/*
	 * copy complete - return current lof
	 */
	sema_v(&bp->b_sem);
	*dst_lofp = dst_lof;
	return (sav_dst_nb);
}

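/*
 * Round the dirty write buf (if any) up to a sector boundary, fill in
 * the sector trailer, and advance the tail.  The buf is async written
 * if the tail wrapped or the buf is now full.
 */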
void
ldl_round_commit(ml_unit_t *ul)
{
	int		wrapped;
	buf_t		*bp;
	sect_trailer_t	*st;
	size_t		bcount;
	cirbuf_t	*cb	= &ul->un_wrbuf;

	/*
	 * if nothing to write; then do nothing
	 */
	if ((bp = cb->cb_dirty) == NULL)
		return;
	makebusy(ul, bp);

	/*
	 * round up to sector boundary and set new tail
	 *	don't readjust st_ident if buf is already rounded
	 */
	bcount = P2ROUNDUP(bp->b_bcount, DEV_BSIZE);
	if (bcount == bp->b_bcount) {
		sema_v(&bp->b_sem);
		return;
	}
	bp->b_bcount = bcount;
	ul->un_tail_lof = dbtob(bp->b_blkno) + bcount;
	wrapped = 0;
	if (ul->un_tail_lof == ul->un_eol_lof) {
		ul->un_tail_lof = ul->un_bol_lof;
		++wrapped;
	}
	ASSERT(ul->un_tail_lof != ul->un_head_lof);

	/*
	 * fix up the sector trailer
	 */
	/* LINTED */
	st = (sect_trailer_t *)
	    ((bp->b_un.b_addr + bcount) - sizeof (*st));
	st->st_tid = ul->un_logmap->mtm_tid;
	st->st_ident = ul->un_tail_ident++;

	/*
	 * if tail wrapped or we have exhausted this buffer
	 *	async write the buffer
	 */
	if (wrapped || bcount == bp->b_bufsize)
		push_dirty_bp(ul, bp);
	else
		sema_v(&bp->b_sem);
}

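/*
 * Async write the dirty write buf, if any.
 */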
void
ldl_push_commit(ml_unit_t *ul)
{
	buf_t		*bp;
	cirbuf_t	*cb	= &ul->un_wrbuf;

	/*
	 * if nothing to write; then do nothing
	 */
	if ((bp = cb->cb_dirty) == NULL)
		return;
	makebusy(ul, bp);
	push_dirty_bp(ul, bp);
}

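/*
 * Return true when more than 75% of the log reservation is in use.
 */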
int
ldl_need_commit(ml_unit_t *ul)
{
	return (ul->un_resv > (ul->un_maxresv - (ul->un_maxresv>>2)));
}

int
ldl_has_space(ml_unit_t *ul, mapentry_t *me)
{
	off_t	nfb;
	off_t	nb;

	ASSERT(MUTEX_HELD(&ul->un_log_mutex));

	/*
	 * Add up the size used by the deltas
	 * round nb up to a sector length plus an extra sector
	 *	w/o the extra sector we couldn't distinguish
	 *	a full log (head == tail) from an empty log (head == tail)
	 */
	for (nb = DEV_BSIZE; me; me = me->me_hash) {
		nb += sizeof (struct delta);
		if (me->me_dt != DT_CANCEL)
			nb += me->me_nb;
	}
	nb = P2ROUNDUP(nb, DEV_BSIZE);

	if (ul->un_head_lof <= ul->un_tail_lof)
		nfb = (ul->un_head_lof - ul->un_bol_lof) +
		    (ul->un_eol_lof - ul->un_tail_lof);
	else
		nfb = ul->un_head_lof - ul->un_tail_lof;

	return (nb < nfb);
}

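/*
 * Write a delta header and, except for commit, cancel, and all-zero
 * deltas, the delta data into the log.  The mapentry records the log
 * offset of the data (or of the next delta).  Called with
 * un_log_mutex held.
 */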
void
ldl_write(ml_unit_t *ul, caddr_t bufp, offset_t bufmof, struct mapentry *me)
{
	buf_t		*bp;
	caddr_t		va;
	size_t		nb;
	size_t		actual;

	ASSERT(MUTEX_HELD(&ul->un_log_mutex));

	/* Write the delta */

	nb = sizeof (struct delta);
	va = (caddr_t)&me->me_delta;
	bp = get_write_bp(ul);

	while (nb) {
		if (ul->un_flags & LDL_ERROR) {
			sema_v(&bp->b_sem);
			return;
		}
		actual = storebuf(ul, bp, va, nb);
		ASSERT(actual);
		va += actual;
		nb -= actual;
		if (nb)
			bp = get_write_bp(ul);
	}

	/* If a commit, cancel, or 0's, we're almost done */
	switch (me->me_dt) {
		case DT_COMMIT:
		case DT_CANCEL:
		case DT_ABZERO:
			/* roll needs to know where the next delta will go */
			me->me_lof = ul->un_tail_lof;
			return;
		default:
			break;
	}

	/* Now write the data */

	ASSERT(me->me_nb != 0);

	nb = me->me_nb;
	va = (me->me_mof - bufmof) + bufp;
	bp = get_write_bp(ul);

	/* Save where we will put the data */
	me->me_lof = ul->un_tail_lof;

	while (nb) {
		if (ul->un_flags & LDL_ERROR) {
			sema_v(&bp->b_sem);
			return;
		}
		actual = storebuf(ul, bp, va, nb);
		ASSERT(actual);
		va += actual;
		nb -= actual;
		if (nb)
			bp = get_write_bp(ul);
	}
}

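/*
 * Wait for all outstanding log writes to complete.
 */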
void
ldl_waito(ml_unit_t *ul)
{
	buf_t		*bp;
	cirbuf_t	*cb	= &ul->un_wrbuf;

	rw_enter(&cb->cb_rwlock, RW_WRITER);
	/*
	 * wait on them
	 */
	bp = cb->cb_bp;
	do {
		if ((bp->b_flags & B_DONE) == 0) {
			makebusy(ul, bp);
			sema_v(&bp->b_sem);
		}
		bp = bp->b_forw;
	} while (bp != cb->cb_bp);
	rw_exit(&cb->cb_rwlock);
}

/*
 * seek nb bytes from location lof
 */
static int
logseek(ml_unit_t *ul, off_t lof, size_t nb, off_t *lofp)
{
	buf_t	*bp;
	ulong_t	actual;

	while (nb) {
		bp = get_read_bp(ul, lof);
		if (bp->b_flags & B_ERROR) {
			sema_v(&bp->b_sem);
			return (EIO);
		}
		actual = fetchbuf(ul, bp, NULL, nb, &lof);
		ASSERT(actual);
		nb -= actual;
	}
	*lofp = lof;
	ASSERT(nb == 0);
	return (0);
}

int
ldl_read(
	ml_unit_t *ul,		/* Log unit */
	caddr_t va,		/* address of buffer to read into */
	offset_t mof,		/* mof of buffer */
	off_t nb,		/* length of buffer */
	mapentry_t *me)		/* Map entry list */
{
	buf_t	*bp;
	crb_t   *crb;
	caddr_t	rva;			/* address to read into */
	size_t	rnb;			/* # of bytes to read */
	off_t	lof;			/* log device offset to read from */
	off_t   skip;
	ulong_t	actual;
	int	error;
	caddr_t	eva	= va + nb;	/* end of buffer */

	for (; me; me = me->me_agenext) {
		ASSERT(me->me_dt != DT_CANCEL);

		/*
		 * check for a cached roll buffer
		 */
		crb = me->me_crb;
		if (crb) {
			if (mof > crb->c_mof) {
				/*
				 * This mapentry overlaps with the beginning of
				 * the supplied buffer
				 */
				skip = mof - crb->c_mof;
				bcopy(crb->c_buf + skip, va,
				    MIN(nb, crb->c_nb - skip));
			} else {
				/*
				 * This mapentry starts at or after
				 * the supplied buffer.
				 */
				skip = crb->c_mof - mof;
				bcopy(crb->c_buf, va + skip,
				    MIN(crb->c_nb, nb - skip));
			}
			logstats.ls_lreadsinmem.value.ui64++;
			continue;
		}

		/*
		 * check for a delta full of zeroes - there's no log data
		 */
		if (me->me_dt == DT_ABZERO) {
			fetchzeroes(va, mof, nb, me);
			continue;
		}

		if (mof > me->me_mof) {
			rnb = (size_t)(mof - me->me_mof);
			error = logseek(ul, me->me_lof, rnb, &lof);
			if (error)
				return (EIO);
			rva = va;
			rnb = me->me_nb - rnb;
			rnb = ((rva + rnb) > eva) ? eva - rva : rnb;
		} else {
			lof = me->me_lof;
			rva = (me->me_mof - mof) + va;
			rnb = ((rva + me->me_nb) > eva) ? eva - rva : me->me_nb;
		}

		while (rnb) {
			bp = get_read_bp(ul, lof);
			if (bp->b_flags & B_ERROR) {
				sema_v(&bp->b_sem);
				return (EIO);
			}
			ASSERT(((me->me_flags & ME_ROLL) == 0) ||
			    (bp != ul->un_wrbuf.cb_dirty));
			actual = fetchbuf(ul, bp, rva, rnb, &lof);
			ASSERT(actual);
			rva += actual;
			rnb -= actual;
		}
	}
	return (0);
}

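/*
 * Write the incore log state (ml_odunit) to both copies kept in the
 * log's state sectors on disk.
 */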
void
ldl_savestate(ml_unit_t *ul)
{
	int		error;
	buf_t		*bp	= ul->un_bp;
	ml_odunit_t	*ud	= (void *)bp->b_un.b_addr;
	ml_odunit_t	*ud2	= (void *)(bp->b_un.b_addr + DEV_BSIZE);

#if	DEBUG
	/*
	 * Scan test is running; don't update intermediate state
	 */
	if (ul->un_logmap && ul->un_logmap->mtm_trimlof)
		return;
#endif	/* DEBUG */

	mutex_enter(&ul->un_state_mutex);
	bcopy(&ul->un_ondisk, ud, sizeof (*ud));
	ud->od_chksum = ud->od_head_ident + ud->od_tail_ident;
	bcopy(ud, ud2, sizeof (*ud));

	/* If a snapshot is enabled, write through the snapshot driver. */
	if (ul->un_ufsvfs->vfs_snapshot)
		UFS_BWRITE2(ul->un_ufsvfs, bp);
	else
		BWRITE2(bp);
	logstats.ls_ldlwrites.value.ui64++;
	error = bp->b_flags & B_ERROR;
	mutex_exit(&ul->un_state_mutex);
	if (error)
		ldl_seterror(ul, "Error writing ufs log state");
}

/*
 * The head will be set to (new_lof - header) since ldl_sethead is
 * called with the new_lof of the data portion of a delta.
 */
void
ldl_sethead(ml_unit_t *ul, off_t data_lof, uint32_t tid)
{
	off_t		nb;
	off_t		new_lof;
	uint32_t	new_ident;
	daddr_t		beg_blkno;
	daddr_t		end_blkno;
	struct timeval	tv;

	ASSERT(MUTEX_HELD(&ul->un_log_mutex));

	if (data_lof == -1) {
		/* log is empty */
		uniqtime(&tv);
		if (tv.tv_usec == ul->un_head_ident) {
			tv.tv_usec++;
		}
		last_loghead_ident = tv.tv_usec;
		new_ident = tv.tv_usec;
		new_lof = ul->un_tail_lof;

	} else {
		/* compute header's lof */
		new_ident = ul->un_head_ident;
		new_lof = data_lof - sizeof (struct delta);

		/* whoops, header spans sectors; subtract out sector trailer */
		if (btodb(new_lof) != btodb(data_lof))
			new_lof -= sizeof (sect_trailer_t);

		/* whoops, header wrapped the log; go to last sector */
		if (new_lof < ul->un_bol_lof) {
			/* sector offset */
			new_lof -= dbtob(btodb(new_lof));
			/* add to last sector's lof */
			new_lof += (ul->un_eol_lof - DEV_BSIZE);
		}
		ul->un_head_tid = tid;
	}

	/*
	 * check for nop
	 */
	if (new_lof == ul->un_head_lof)
		return;

	/*
	 * invalidate the affected bufs and calculate new ident
	 */
	if (new_lof > ul->un_head_lof) {
		nb = new_lof - ul->un_head_lof;
		inval_range(ul, &ul->un_wrbuf, ul->un_head_lof, nb);
		inval_range(ul, &ul->un_rdbuf, ul->un_head_lof, nb);

		end_blkno = btodb(new_lof);
		beg_blkno = btodb(ul->un_head_lof);
		new_ident += (end_blkno - beg_blkno);
	} else {
		nb = ul->un_eol_lof - ul->un_head_lof;
		inval_range(ul, &ul->un_wrbuf, ul->un_head_lof, nb);
		inval_range(ul, &ul->un_rdbuf, ul->un_head_lof, nb);

		end_blkno = btodb(ul->un_eol_lof);
		beg_blkno = btodb(ul->un_head_lof);
		new_ident += (end_blkno - beg_blkno);

		nb = new_lof - ul->un_bol_lof;
		inval_range(ul, &ul->un_wrbuf, ul->un_bol_lof, nb);
		inval_range(ul, &ul->un_rdbuf, ul->un_bol_lof, nb);

		end_blkno = btodb(new_lof);
		beg_blkno = btodb(ul->un_bol_lof);
		new_ident += (end_blkno - beg_blkno);
	}
	/*
	 * don't update the head if there has been an error
	 */
	if (ul->un_flags & LDL_ERROR)
		return;

	/* Fix up the head and ident */
	ASSERT(new_lof >= ul->un_bol_lof);
	ul->un_head_lof = new_lof;
	ul->un_head_ident = new_ident;
	if (data_lof == -1) {
		ul->un_tail_ident = ul->un_head_ident;
	}


	/* Commit to the database */
	ldl_savestate(ul);

	ASSERT(((ul->un_logmap->mtm_debug & MT_SCAN) == 0) ||
	    ldl_sethead_debug(ul));
}

/*
 * The tail will be set to the sector following lof+nb
 *	lof + nb == size of the last delta + commit record
 *	this function is called once after the log scan has completed.
 */
void
ldl_settail(ml_unit_t *ul, off_t lof, size_t nb)
{
	off_t		new_lof;
	uint32_t	new_ident;
	daddr_t		beg_blkno;
	daddr_t		end_blkno;
	struct timeval	tv;

	ASSERT(MUTEX_HELD(&ul->un_log_mutex));

	if (lof == -1) {
		uniqtime(&tv);
		if (tv.tv_usec == ul->un_head_ident) {
			tv.tv_usec++;
		}
		last_loghead_ident = tv.tv_usec;
		ul->un_tail_lof = dbtob(btodb(ul->un_head_lof));
		ul->un_head_lof = ul->un_tail_lof;
		ul->un_head_ident = tv.tv_usec;
		ul->un_tail_ident = ul->un_head_ident;

		/* Commit to the database */
		ldl_savestate(ul);

		return;
	}

	/*
	 * new_lof is the offset of the sector following the last commit
	 */
	(void) logseek(ul, lof, nb, &new_lof);
	ASSERT(new_lof != dbtob(btodb(ul->un_head_lof)));

	/*
	 * calculate new ident
	 */
	if (new_lof > ul->un_head_lof) {
		end_blkno = btodb(new_lof);
		beg_blkno = btodb(ul->un_head_lof);
		new_ident = ul->un_head_ident + (end_blkno - beg_blkno);
	} else {
		end_blkno = btodb(ul->un_eol_lof);
		beg_blkno = btodb(ul->un_head_lof);
		new_ident = ul->un_head_ident + (end_blkno - beg_blkno);

		end_blkno = btodb(new_lof);
		beg_blkno = btodb(ul->un_bol_lof);
		new_ident += (end_blkno - beg_blkno);
	}

	/* Fix up the tail and ident */
	ul->un_tail_lof = new_lof;
	ul->un_tail_ident = new_ident;

	/* Commit to the database */
	ldl_savestate(ul);
}

/*
 * LOGSCAN STUFF
 */
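/*
 * Verify the sector trailer idents in a buf read during log scan,
 * truncating the buf down to the last sector with a valid ident.
 * Returns non-zero if lof still falls within the valid portion.
 */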
static int
ldl_logscan_ident(ml_unit_t *ul, buf_t *bp, off_t lof)
{
	ulong_t		ident;
	size_t		nblk, i;
	sect_trailer_t	*st;

	/*
	 * compute ident for first sector in the buffer
	 */
	ident = ul->un_head_ident;
	if (bp->b_blkno >= btodb(ul->un_head_lof)) {
		ident += (bp->b_blkno - btodb(ul->un_head_lof));
	} else {
		ident += (btodb(ul->un_eol_lof) - btodb(ul->un_head_lof));
		ident += (bp->b_blkno - btodb(ul->un_bol_lof));
	}
	/*
	 * truncate the buffer down to the last valid sector
	 */
	nblk = btodb(bp->b_bcount);
	bp->b_bcount = 0;
	/* LINTED */
	st = (sect_trailer_t *)(bp->b_un.b_addr + LDL_USABLE_BSIZE);
	for (i = 0; i < nblk; ++i) {
		if (st->st_ident != ident)
			break;

		/* remember last valid tid for ldl_logscan_error() */
		ul->un_tid = st->st_tid;

		/* LINTED */
		st = (sect_trailer_t *)(((caddr_t)st) + DEV_BSIZE);
		++ident;
		bp->b_bcount += DEV_BSIZE;
	}
	/*
	 * make sure that lof is still within range
	 */
	return (within_range(lof, bp->b_blkno, bp->b_bcount));
}

ulong_t
ldl_logscan_nbcommit(off_t lof)
{
	/*
	 * lof is the offset following the commit header.  However,
	 * if the commit header fell on the end-of-sector, then lof
	 * has already been advanced to the beginning of the next
	 * sector.  So do nothing.  Otherwise, return the remaining
	 * bytes in the sector.
	 */
	if ((lof & (DEV_BSIZE - 1)) == 0)
		return (0);
	return (NB_LEFT_IN_SECTOR(lof));
}

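/*
 * Read nb bytes of log data starting at *lofp on behalf of log scan,
 * checking that the range lies within the log and that the sector
 * trailer idents are valid.  *lofp is advanced past the data read.
 */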
int
ldl_logscan_read(ml_unit_t *ul, off_t *lofp, size_t nb, caddr_t va)
{
	buf_t	*bp;
	ulong_t	actual;

	ASSERT(ul->un_head_lof != ul->un_tail_lof);

	/*
	 * Check that the log data doesn't go out of bounds
	 */
	if (ul->un_head_lof < ul->un_tail_lof) {
		if (!WITHIN(*lofp, nb, ul->un_head_lof,
		    (ul->un_tail_lof - ul->un_head_lof))) {
			return (EIO);
		}
	} else {
		if (OVERLAP(*lofp, nb, ul->un_tail_lof,
		    (ul->un_head_lof - ul->un_tail_lof))) {
			return (EIO);
		}
	}

	while (nb) {
		bp = get_read_bp(ul, *lofp);
		if (bp->b_flags & B_ERROR) {
			sema_v(&bp->b_sem);
			return (EIO);
		}
		/*
		 * out-of-seq idents mean a partial transaction:
		 *	panic, non-corrupting powerfail, ...
		 */
		if (!ldl_logscan_ident(ul, bp, *lofp)) {
			sema_v(&bp->b_sem);
			return (EIO);
		}
		/*
		 * copy the header into the caller's buf
		 */
		actual = fetchbuf(ul, bp, va, nb, lofp);
		if (va)
			va += actual;
		nb -= actual;
	}
	return (0);
}

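/*
 * Prepare for log scan: mark the scan in progress, reset the circular
 * buffers, and set the tail to make the log appear full.
 */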
void
ldl_logscan_begin(ml_unit_t *ul)
{
	size_t	bufsize;

	ASSERT(ul->un_wrbuf.cb_dirty == NULL);

	/*
	 * logscan has begun
	 */
	ul->un_flags |= LDL_SCAN;

	/*
	 * reset the circular bufs
	 */
	bufsize = ldl_bufsize(ul);
	alloc_rdbuf(&ul->un_rdbuf, bufsize, bufsize);
	alloc_wrbuf(&ul->un_wrbuf, bufsize);

	/*
	 * set the tail to reflect a full log
	 */
	ul->un_tail_lof = dbtob(btodb(ul->un_head_lof)) - DEV_BSIZE;

	if (ul->un_tail_lof < ul->un_bol_lof)
		ul->un_tail_lof = ul->un_eol_lof - DEV_BSIZE;
	if (ul->un_tail_lof >= ul->un_eol_lof)
		ul->un_tail_lof = ul->un_bol_lof;

	/*
	 * un_tid is used during error processing; it is initialized to
	 * the tid of the delta at un_head_lof.
	 */
	ul->un_tid = ul->un_head_tid;
}

void
ldl_logscan_end(ml_unit_t *ul)
{
	size_t	bufsize;

	/*
	 * reset the circular bufs
	 */
	bufsize = ldl_bufsize(ul);
	alloc_rdbuf(&ul->un_rdbuf, MAPBLOCKSIZE, MAPBLOCKSIZE);
	alloc_wrbuf(&ul->un_wrbuf, bufsize);

	/*
	 * Done w/scan
	 */
	ul->un_flags &= ~LDL_SCAN;
}

int
ldl_need_roll(ml_unit_t *ul)
{
	off_t	busybytes;
	off_t	head;
	off_t	tail;
	off_t	bol;
	off_t	eol;
	off_t	nb;

	/*
	 * snapshot the log state
	 */
	head = ul->un_head_lof;
	tail = ul->un_tail_lof;
	bol = ul->un_bol_lof;
	eol = ul->un_eol_lof;
	nb = ul->un_logsize;

	/*
	 * compute number of busy (inuse) bytes
	 */
	if (head <= tail)
		busybytes = tail - head;
	else
		busybytes = (eol - head) + (tail - bol);

	/*
	 * return TRUE if > 75% full
	 */
	return (busybytes > (nb - (nb >> 2)));
}

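/*
 * Put the log into the error state: mark the log bad on disk, warn
 * the operator, and (unless a log scan is in progress) hard lock the
 * file system.
 */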
void
ldl_seterror(ml_unit_t *ul, char *why)
{
	/*
	 * already in error state; do nothing
	 */
	if (ul->un_flags & LDL_ERROR)
		return;

	ul->un_flags |= LDL_ERROR;	/* incore */
	ul->un_badlog = 1;		/* ondisk (cleared by fsck) */

	/*
	 * Commit to state sectors
	 */
	uniqtime(&ul->un_timestamp);
	ldl_savestate(ul);

	/* Pretty print */
	cmn_err(CE_WARN, "%s", why);
	cmn_err(CE_WARN, "ufs log for %s changed state to Error",
	    ul->un_ufsvfs->vfs_fs->fs_fsmnt);
	cmn_err(CE_WARN, "Please umount(1M) %s and run fsck(1M)",
	    ul->un_ufsvfs->vfs_fs->fs_fsmnt);

	/*
	 * If we aren't in the middle of a scan (aka snarf), tell ufs
	 * to hard lock itself.
	 */
	if ((ul->un_flags & LDL_SCAN) == 0)
		ufs_trans_onerror();
}

size_t
ldl_bufsize(ml_unit_t *ul)
{
	size_t		bufsize;
	extern uint32_t	ldl_minbufsize;

	/*
	 * initial guess is the maxtransfer value for this log device
	 * 	increase if too small
	 * 	decrease if too large
	 */
	bufsize = dbtob(btod(ul->un_maxtransfer));
	if (bufsize < ldl_minbufsize)
		bufsize = ldl_minbufsize;
	if (bufsize > maxphys)
		bufsize = maxphys;
	if (bufsize > ul->un_maxtransfer)
		bufsize = ul->un_maxtransfer;
	return (bufsize);
}