/* md_mddb.c (revision 32c22d57860198538fb6b8f261cb76ab26318d34) - OpenGrok cross reference for /titanic_41/usr/src/uts/common/io/lvm/md/md_mddb.c */

/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
 */

#include <sys/types.h>
#include <sys/conf.h>
#include <sys/time.h>
#include <sys/uio.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/systeminfo.h>
#include <sys/sysmacros.h>
#include <sys/buf.h>
#include <sys/kmem.h>
#include <sys/file.h>
#include <sys/open.h>
#include <sys/debug.h>
#include <sys/stat.h>
#include <sys/lvm/mdvar.h>
#include <sys/lvm/md_crc.h>
#include <sys/lvm/md_convert.h>
#include <sys/types.h>
#include <sys/kmem.h>
#include <sys/lvm/mdmn_commd.h>
#include <sys/cladm.h>

/*
 * Default mhd_mhiargs_t settings.  The initializers are positional, so
 * their order must follow the member order of mhd_mhiargs_t (declared
 * outside this file).
 * NOTE(review): the values look like timeouts/intervals; confirm the
 * meaning and units against the mhd_mhiargs_t definition.
 */
mhd_mhiargs_t	defmhiargs = {
	1000,
	{ 6000, 6000, 30000 }
};

#define	MDDB

#include <sys/lvm/mdvar.h>
#include <sys/lvm/mdmed.h>
#include <sys/lvm/md_names.h>
#include <sys/cred.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/esunddi.h>

#include <sys/sysevent/eventdefs.h>
#include <sys/sysevent/svm.h>

extern char svm_bootpath[];

int			md_maxbootlist = MAXBOOTLIST;
static ulong_t		mddb_maxblocks = 0;	/* tune for small records */
static int		mddb_maxbufheaders = 50;
static uint_t		mddb_maxcopies = MDDB_NLB;

/*
 * If this is set, more detailed messages about DB init will be given, instead
 * of just the MDE_DB_NODB.
 */
static int		mddb_db_err_detail = 0;

/*
 * This lock is used to single-thread load/unload of all sets
 */
static kmutex_t		mddb_lock;

/*
 * You really do NOT want to change this boolean.
 * It can be VERY dangerous to do so.  Loss of
 * data may occur. USE AT YOUR OWN RISK!!!!
 */
static int		mddb_allow_half = 0;
/*
 * For mirrored root allow reboot with only half the replicas available
 * Flag inserted for Santa Fe project.
 */
int mirrored_root_flag;

/*
 * Character classification helpers.
 *	ISWHITE(c) - true if c is a space, tab, carriage return or newline
 *	ISNUM(c)   - true if c is an ASCII decimal digit
 * Both macros evaluate their argument more than once, so do not pass an
 * expression with side effects.
 */
#define	ISWHITE(c)	(((c) == ' ') || ((c) == '\t') || \
			    ((c) == '\r') || ((c) == '\n'))
#define	ISNUM(c)	(((c) >= '0') && ((c) <= '9'))

/* Address of the s_dbmx mutex belonging to md_set[setno] */
#define	SETMUTEX(setno)	(&md_set[setno].s_dbmx)

extern md_krwlock_t	md_unit_array_rw;	/* md.c */
extern set_t		md_nsets;		/* md.c */
extern int		md_nmedh;		/* md.c */
extern md_set_t		md_set[];		/* md.c */
extern int		(*mdv_strategy_tstpnt)(buf_t *, int, void*);
extern dev_info_t	*md_devinfo;
extern int		md_init_debug;
extern int		md_status;
extern md_ops_t		*md_opslist;
extern md_krwlock_t	nm_lock;

static int 		update_locatorblock(mddb_set_t *s, md_dev64_t dev,
				ddi_devid_t didptr, ddi_devid_t old_didptr);

/*
 * Defines for crc calculation for records.  Both macros are thin wrappers
 * around rec_crcfunc(), selecting its mode through the last argument:
 *	rec_crcgen generates a crc checksum for a record block; the value
 *		returned by rec_crcfunc() is discarded.
 *	rec_crcchk checks the crc checksum for a record block and evaluates
 *		to the value returned by rec_crcfunc().
 */
#define	REC_CRCGEN	0
#define	REC_CRCCHK	1
#define	rec_crcgen(s, dep, rbp) \
	(void) rec_crcfunc(s, dep, rbp, REC_CRCGEN)
#define	rec_crcchk(s, dep, rbp) \
	rec_crcfunc(s, dep, rbp, REC_CRCCHK)

/*
 * During upgrade, SVM basically runs with the devt from the target
 * being upgraded.  Translations are made from the target devt to the
 * miniroot devt when writing data out to the disk.  This is done by
 * the following routines:
 *	wrtblklst
 *	writeblks
 *	readblklst
 *	readblks
 *	dt_read
 *
 * The following routines are used by the routines listed above and
 * expect a translated (aka miniroot) devt:
 *	getblks
 * 	getmasters
 *
 * Also, when calling any system routines, such as ddi_lyr_get_devid,
 * the translated (aka miniroot) devt must be used.
 *
 * By the same token, the major number and major name conversion operations
 * need to use the name_to_major file from the target system instead
 * of the name_to_major file on the miniroot.  So, calls to
 * ddi_name_to_major must be replaced with calls to md_targ_name_to_major
 * when running on an upgrade.  Same is true with calls to
 * ddi_major_to_name.
 */


#ifndef MDDB_FAKE

/*
 * mddb_rwdata
 *
 * Hand the buf to the block driver (or to the test point hook, when one is
 * installed and claims the request) with the set mutex dropped.
 *
 * B_ASYNC is never passed down in b_flags; it only tells this routine not
 * to wait for completion.  The set mutex is always re-acquired before
 * returning.
 *
 * Returns 0 for an async request, otherwise the result of biowait().
 */
static int
mddb_rwdata(
	mddb_set_t	*s,	/* incore db set structure */
	int		flag,	/* B_ASYNC, B_FAILFAST or 0 passed in here */
	buf_t		*bp
)
{
	int		rval = 0;

	bp->b_flags = (flag | B_BUSY) & (~B_ASYNC);

	/* never hold the set mutex across the I/O */
	mutex_exit(SETMUTEX(s->s_setno));

	if (mdv_strategy_tstpnt == NULL ||
	    (*mdv_strategy_tstpnt)(bp, 0, NULL) == 0)
		(void) bdev_strategy(bp);

	/* synchronous callers wait here; async callers collect it later */
	if (!(flag & B_ASYNC))
		rval = biowait(bp);

	mutex_enter(SETMUTEX(s->s_setno));
	return (rval);
}

/*
 * setidentifier
 *
 * Copy the set's identity into ident: the serial string for the local
 * set, the creation time for every other set.
 */
static void
setidentifier(
	mddb_set_t	*s,
	identifier_t	*ident
)
{
	if (s->s_setno != MD_LOCAL_SET) {
		ident->createtime = s->s_ident.createtime;
		return;
	}

	(void) strcpy(ident->serial, s->s_ident.serial);
}

/*
 * cmpidentifier
 *
 * Compare ident against the identity of set s.  The local set is compared
 * by serial string, all other sets by creation time.
 *
 * Returns 0 when the identifiers match and non-zero when they differ.
 */
static int
cmpidentifier(
	mddb_set_t	*s,
	identifier_t	*ident
)
{
	if (s->s_setno != MD_LOCAL_SET) {
		/*CSTYLED*/
		return (timercmp(&ident->createtime, &s->s_ident.createtime, !=));
	}

	return (strcmp(ident->serial, s->s_ident.serial));
}

/*
 * mddb_devopen
 *
 * Layered open (read/write) of the device.
 * Returns 0 if the open succeeded, 1 otherwise.
 */
static int
mddb_devopen(
	md_dev64_t	dev
)
{
	dev_t		lyr_dev = md_dev64_to_dev(dev);
	int		open_err;

	open_err = dev_lopen(&lyr_dev, FREAD|FWRITE, OTYP_LYR, kcred);
	return ((open_err == 0) ? 0 : 1);
}

/*
 * mddb_devclose
 *
 * Layered close matching mddb_devopen(); the close status is ignored.
 */
static void
mddb_devclose(
	md_dev64_t	dev
)
{
	dev_t		lyr_dev = md_dev64_to_dev(dev);

	(void) dev_lclose(lyr_dev, FREAD|FWRITE, OTYP_LYR, kcred);
}

/*
 * stripe_skip_ts
 *
 * Build the list of areas to leave out of the checksum of a stripe record:
 * one entry per component, covering that component's un_mirror.ms_timestamp.
 * The unit layout (32 bit on-disk or 64 bit) is chosen from the record
 * revision.  Offsets are relative to the start of the record block, so the
 * offset of rb_data within mddb_rb32_t is added in.
 *
 * Returns the head of a kmem_zalloc'ed list which the caller must free, or
 * NULL if the unit has no rows, has no components, or the revision is not
 * one of the known ones.
 */
static crc_skip_t *
stripe_skip_ts(void *un, uint_t revision)
{
	ms_unit32_od_t		*un32;
	ms_unit_t		*un64;
	crc_skip_t		*head = NULL;
	crc_skip_t		**linkp = &head;
	crc_skip_t		*ent;
	uint_t			row, comp;
	uint_t			ncomps = 0;
	uint_t			compoff;	/* offset of first component */
	size_t			compsize;	/* size of one component */
	size_t			tsoff;		/* timestamp within component */
	uint_t			rb_off = offsetof(mddb_rb32_t, rb_data[0]);

	/*
	 * Work out the component count and geometry for this revision.
	 */
	switch (revision) {
	case MDDB_REV_RB:
	case MDDB_REV_RBFN:
		un32 = (ms_unit32_od_t *)un;
		if (un32->un_nrows == 0)
			return (NULL);
		for (row = 0; row < un32->un_nrows; row++)
			ncomps += un32->un_row[row].un_ncomp;
		compoff = un32->un_ocomp + rb_off;
		compsize = sizeof (ms_comp32_od_t);
		tsoff = offsetof(ms_comp32_od_t, un_mirror.ms_timestamp);
		break;
	case MDDB_REV_RB64:
	case MDDB_REV_RB64FN:
		un64 = (ms_unit_t *)un;
		if (un64->un_nrows == 0)
			return (NULL);
		for (row = 0; row < un64->un_nrows; row++)
			ncomps += un64->un_row[row].un_ncomp;
		compoff = un64->un_ocomp + rb_off;
		compsize = sizeof (ms_comp_t);
		tsoff = offsetof(ms_comp_t, un_mirror.ms_timestamp);
		break;
	default:
		/* unknown revision - nothing to skip */
		return (NULL);
	}

	/*
	 * One skip entry per component, appended in component order.
	 */
	for (comp = 0; comp < ncomps; comp++) {
		uint_t	mdcp = compoff + (comp * compsize);

		ent = kmem_zalloc(sizeof (crc_skip_t), KM_SLEEP);
		ent->skip_offset = mdcp + tsoff;
		ent->skip_size = sizeof (md_timeval32_t);
		*linkp = ent;
		linkp = &ent->skip_next;
	}

	return (head);
}

/*
 * mirror_skip_ts
 *
 * Build the list of areas to leave out of the checksum of a mirror record:
 * first un_last_read, then sm_timestamp of each of the NMIRROR submirror
 * slots.  The unit layout is chosen from the record revision; for an
 * unrecognised revision the entries are still created but their offsets
 * are left at zero.
 *
 * Returns the head of a kmem_zalloc'ed list which the caller must free.
 */
static crc_skip_t *
mirror_skip_ts(uint_t revision)
{
	crc_skip_t	*head;
	crc_skip_t	*tail;
	crc_skip_t	*ent;
	uint_t		rb_off = offsetof(mddb_rb32_t, rb_data[0]);
	int		sm;

	/* un_last_read */
	head = kmem_zalloc(sizeof (crc_skip_t), KM_SLEEP);
	if (revision == MDDB_REV_RB || revision == MDDB_REV_RBFN) {
		head->skip_offset = offsetof(mm_unit32_od_t,
		    un_last_read) + rb_off;
	} else if (revision == MDDB_REV_RB64 || revision == MDDB_REV_RB64FN) {
		head->skip_offset = offsetof(mm_unit_t,
		    un_last_read) + rb_off;
	}
	head->skip_size = sizeof (int);
	tail = head;

	/* per submirror timestamps */
	for (sm = 0; sm < NMIRROR; sm++) {
		ent = kmem_zalloc(sizeof (crc_skip_t), KM_SLEEP);
		if (revision == MDDB_REV_RB || revision == MDDB_REV_RBFN) {
			ent->skip_offset = offsetof(mm_unit32_od_t,
			    un_sm[sm].sm_timestamp) + rb_off;
		} else if (revision == MDDB_REV_RB64 ||
		    revision == MDDB_REV_RB64FN) {
			ent->skip_offset = offsetof(mm_unit_t,
			    un_sm[sm].sm_timestamp) + rb_off;
		}
		ent->skip_size = sizeof (md_timeval32_t);
		tail->skip_next = ent;
		tail = ent;
	}

	return (head);
}

/*
 * hotspare_skip_ts
 *
 * Build the (single entry) list of areas to leave out of the checksum of a
 * hotspare record: the hs_timestamp field.  The record layout is chosen
 * from the revision; for an unrecognised revision the offset is left at
 * zero.
 *
 * Returns a kmem_zalloc'ed entry which the caller must free.
 */
static crc_skip_t *
hotspare_skip_ts(uint_t revision)
{
	crc_skip_t	*ent;
	uint_t		rb_off = offsetof(mddb_rb32_t, rb_data[0]);

	ent = kmem_zalloc(sizeof (crc_skip_t), KM_SLEEP);
	ent->skip_size = sizeof (md_timeval32_t);

	if (revision == MDDB_REV_RB || revision == MDDB_REV_RBFN) {
		ent->skip_offset = offsetof(hot_spare32_od_t, hs_timestamp) +
		    rb_off;
	} else if (revision == MDDB_REV_RB64 || revision == MDDB_REV_RB64FN) {
		ent->skip_offset = offsetof(hot_spare_t, hs_timestamp) +
		    rb_off;
	}

	return (ent);
}

/*
 * rec_crcfunc
 *
 * Generate (check == REC_CRCGEN) or verify (check == REC_CRCCHK) the
 * checksum of record block rbp, whose size and type come from directory
 * entry dep.
 *
 * The three uint_t sized fields starting at rb_checksum_fiddle are always
 * left out of the calculation.  For a multi-owner (MN) diskset the record
 * block may be written by different nodes (e.g. after a master change), so
 * rb_timestamp and, for stripe, mirror and hotspare records, the timestamps
 * inside the record data are left out as well.  Otherwise each node would
 * produce a different checksum for the same record block and the
 * exclusive-or of all record block checksums and data block record sums
 * would be non-zero once a new master had written a record block.
 *
 * When generating, the checksum is stored in rbp->rb_checksum and returned.
 * When checking, the value returned by crcchk() is passed back.
 */
static uint_t
rec_crcfunc(
	mddb_set_t	*s,
	mddb_de_ic_t	*dep,
	mddb_rb32_t	*rbp,
	int		check
)
{
	mddb_type_t	type = dep->de_type1;
	crc_skip_t	*skip_list;
	crc_skip_t	*ts_skip;
	crc_skip_t	*next;
	uint_t		ret;

	/* rb_checksum_fiddle and the two fields following it */
	skip_list = kmem_zalloc(sizeof (crc_skip_t), KM_SLEEP);
	skip_list->skip_offset = offsetof(mddb_rb32_t, rb_checksum_fiddle);
	skip_list->skip_size = 3 * sizeof (uint_t);

	if (MD_MNSET_SETNO(s->s_setno)) {
		/* MN set: rb_timestamp is node specific */
		ts_skip = kmem_zalloc(sizeof (crc_skip_t), KM_SLEEP);
		ts_skip->skip_offset = offsetof(mddb_rb32_t, rb_timestamp);
		ts_skip->skip_size = sizeof (md_timeval32_t);
		skip_list->skip_next = ts_skip;

		/* module records: append the timestamps within the data */
		if (type >= MDDB_FIRST_MODID) {
			switch (dep->de_flags) {
			case MDDB_F_STRIPE:
				ts_skip->skip_next = stripe_skip_ts(
				    (void *)rbp->rb_data, rbp->rb_revision);
				break;
			case MDDB_F_MIRROR:
				ts_skip->skip_next =
				    mirror_skip_ts(rbp->rb_revision);
				break;
			case MDDB_F_HOTSPARE:
				ts_skip->skip_next =
				    hotspare_skip_ts(rbp->rb_revision);
				break;
			default:
				break;
			}
		}
	}

	if (!check) {
		crcgen(rbp, &rbp->rb_checksum, dep->de_recsize, skip_list);
		ret = rbp->rb_checksum;
	} else {
		ret = crcchk(rbp, &rbp->rb_checksum, dep->de_recsize,
		    skip_list);
	}

	/* the skip list was only needed for this one calculation */
	for (; skip_list != NULL; skip_list = next) {
		next = skip_list->skip_next;
		kmem_free(skip_list, sizeof (crc_skip_t));
	}

	return (ret);
}

/*
 * allocbuffer
 *
 * Take a buffer header off the set's free list.  If the list is empty and
 * sleepflag is MDDB_NOSLEEP, NULL is returned; otherwise the caller waits
 * on s_buf_cv (together with the set mutex) until freebuffer() returns
 * one.
 *
 * The header is handed back zeroed, with b_forw/b_back pointing at the
 * embedded buf itself and b_flags set to B_BUSY.
 */
static mddb_bf_t *
allocbuffer(
	mddb_set_t	*s,
	int		sleepflag
)
{
	mddb_bf_t	*hdr;

	for (;;) {
		hdr = s->s_freebufhead;
		if (hdr != NULL)
			break;

		if (sleepflag == MDDB_NOSLEEP)
			return ((mddb_bf_t *)NULL);

		s->s_bufmisses++;
#ifdef	DEBUG
		if (s->s_bufmisses == 1)
			cmn_err(CE_NOTE,
			    "md: mddb: set %u sleeping for buffer", s->s_setno);
#endif
		/* ask freebuffer() to wake us */
		s->s_bufwakeup = 1;
		cv_wait(&s->s_buf_cv, SETMUTEX(s->s_setno));
	}

	s->s_freebufhead = hdr->bf_next;

	bzero((caddr_t)hdr, sizeof (*hdr));
	hdr->bf_buf.b_forw = &hdr->bf_buf;
	hdr->bf_buf.b_back = &hdr->bf_buf;
	hdr->bf_buf.b_flags = B_BUSY;	/* initialize flags */

	return (hdr);
}

/*
 * freebuffer
 *
 * Put a buffer header back on the front of the set's free list and, if
 * allocbuffer() has flagged that somebody is waiting for one, wake all
 * waiters.
 */
static void
freebuffer(
	mddb_set_t		*s,
	mddb_bf_t	*bfp
)
{
	bfp->bf_next = s->s_freebufhead;
	s->s_freebufhead = bfp;

	if (!s->s_bufwakeup)
		return;

	cv_broadcast(&s->s_buf_cv);
	s->s_bufwakeup = 0;
}


/*
 * blkbusy
 *
 * Mark logical block blk as in use: set its bit in the set's block bitmap
 * (one bit per block, eight blocks per byte) and drop the free block
 * count.  The block must not already be marked busy.
 */
static void
blkbusy(
	mddb_set_t	*s,
	mddb_block_t	blk
)
{
	int		idx = blk / 8;
	int		mask = 1 << (blk & 7);

	s->s_freeblkcnt--;
	ASSERT(! (s->s_freebitmap[idx] & mask));
	s->s_freebitmap[idx] |= mask;
}

/*
 * blkfree
 *
 * Mark logical block blk as free: clear its bit in the set's block bitmap
 * and bump the free block count.  The block must currently be marked busy.
 */
static void
blkfree(
	mddb_set_t	*s,
	mddb_block_t	blk
)
{
	int		idx = blk / 8;
	int		mask = 1 << (blk & 7);

	s->s_freeblkcnt++;
	ASSERT(s->s_freebitmap[idx] & mask);
	s->s_freebitmap[idx] &= ~mask;
}

/*
 * blkcheck
 *
 * Returns non-zero if logical block blk is marked busy in the set's block
 * bitmap, 0 if it is free.
 */
static int
blkcheck(
	mddb_set_t	*s,
	mddb_block_t	blk
)
{
	int		idx = blk / 8;
	int		mask = 1 << (blk & 7);

	return (s->s_freebitmap[idx] & mask);
}

/*
 * getfreeblks
 *
 * First-fit search for count contiguous free logical blocks - not fast but
 * simple.  When a run is found every block in it is marked busy and the
 * number of the first block is returned.  Returns 0 if no run of that
 * length exists.
 */
static mddb_block_t
getfreeblks(
	mddb_set_t	*s,
	size_t		count
)
{
	int		blk;
	int		b;
	size_t		run = 0;	/* length of current free run */
	size_t		start;

	for (blk = 0; blk < s->s_totalblkcnt; blk++) {
		if (blkcheck(s, blk)) {
			/* busy block ends the current run */
			run = 0;
			continue;
		}

		if (++run != count)
			continue;

		/* blk is the last block of a long enough run; claim it */
		start = blk - count + 1;
		for (b = (int)start; b < start + count; b++)
			blkbusy(s, b);
		return ((mddb_block_t)start);
	}

	return (0);
}

/*
 * computefreeblks
 *
 * Rebuild the set's block accounting (s_totalblkcnt, s_freeblkcnt and the
 * block bitmap) from the incore locator block, master blocks and directory.
 *
 * The usable size is the block count of the smallest replica (summed over
 * that replica's master block chain), capped at MDDB_MN_MAXBLKS for a MN
 * set or MDDB_MAXBLKS otherwise; 100 is used if no replica reports any
 * blocks.  A replica whose block count does not exceed the highest logical
 * block number in use is flagged MDDB_F_TOOSMALL and loses MDDB_F_ACTIVE.
 *
 * The bitmap is allocated on first use, cleared, and then every block that
 * is in use is marked busy.  Each blkbusy() call also decrements
 * s_freeblkcnt, which starts out equal to the total.
 */
static void
computefreeblks(
	mddb_set_t	*s
)
{
	mddb_db_t	*dbp;
	mddb_de_ic_t	*dep;
	int		i;
	int		minblks;
	int		freeblks;
	mddb_mb_ic_t	*mbip;
	mddb_lb_t	*lbp;
	mddb_block_t	maxblk;
	mddb_did_db_t	*did_dbp;
	int		nblks;

	minblks = 0;
	lbp = s->s_lbp;
	maxblk = 0;

	/*
	 * Determine the max number of blocks.
	 */
	nblks = (lbp->lb_flags & MDDB_MNSET) ? MDDB_MN_MAXBLKS : MDDB_MAXBLKS;
	/*
	 * go through and find highest logical block
	 */
	for (dbp = s->s_dbp; dbp != 0;	dbp = dbp->db_next) {
		if (dbp->db_blknum > maxblk)
			maxblk = dbp->db_blknum;
		for (dep = dbp->db_firstentry; dep != 0; dep = dep->de_next)
			for (i = 0; i < dep->de_blkcount; i++)
				if (dep->de_blks[i] > maxblk)
					maxblk = dep->de_blks[i];
	}

	/*
	 * Find the size of the smallest replica, ignoring locators that are
	 * deleted or have a master block error.
	 */
	for (i = 0; i < lbp->lb_loccnt; i++) {
		mddb_locator_t	*lp = &lbp->lb_locators[i];

		if ((lp->l_flags & MDDB_F_DELETED) ||
		    (lp->l_flags & MDDB_F_EMASTER))
			continue;

		/* total blocks described by this replica's master blocks */
		freeblks = 0;
		for (mbip = s->s_mbiarray[i]; mbip != NULL;
		    mbip = mbip->mbi_next) {
			freeblks += mbip->mbi_mddb_mb.mb_blkcnt;
		}
		if (freeblks == 0)	/* this happen when there is no */
			continue;	/*	master blk		*/

		/* replica cannot hold the highest block already in use */
		if (freeblks <= maxblk) {
			lp->l_flags |= MDDB_F_TOOSMALL;
			lp->l_flags &= ~MDDB_F_ACTIVE;
		}

		if (freeblks < minblks || minblks == 0)
			minblks = freeblks;
	}
	/*
	 * set up reasonable freespace if no
	 * data bases exist
	 */
	if (minblks == 0)
		minblks = 100;
	if (minblks > nblks)
		minblks = nblks;
	s->s_freeblkcnt = minblks;
	s->s_totalblkcnt = minblks;
	/* one bit per block; the bitmap is only allocated the first time */
	if (! s->s_freebitmapsize) {
		s->s_freebitmapsize = nblks / 8;
		s->s_freebitmap = (uchar_t *)kmem_zalloc(s->s_freebitmapsize,
		    KM_SLEEP);
	}
	/* start from "everything free" and mark what is in use below */
	bzero((caddr_t)s->s_freebitmap, s->s_freebitmapsize);

	/* locator block sectors */
	for (i = 0; i < s->s_lbp->lb_blkcnt; i++)
		blkbusy(s, i);

	/* locator name sectors */
	for (i = 0; i < s->s_lbp->lb_lnblkcnt; i++)
		blkbusy(s, (s->s_lbp->lb_lnfirstblk + i));

	if (lbp->lb_flags & MDDB_DEVID_STYLE) {
		/* locator block device id information */
		for (i = 0; i < s->s_lbp->lb_didblkcnt; i++)
			blkbusy(s, (s->s_lbp->lb_didfirstblk + i));

		/* disk blocks containing actual device ids */
		did_dbp = s->s_did_icp->did_ic_dbp;
		while (did_dbp) {
			for (i = 0; i < did_dbp->db_blkcnt; i++) {
				blkbusy(s, did_dbp->db_firstblk + i);
			}
			did_dbp = did_dbp->db_next;
		}
	}

	/* Only use data tags if not a MN set */
	if (!(lbp->lb_flags & MDDB_MNSET)) {
		/* Found a bad tag, do NOT mark the data tag blks busy here */
		if (! (md_get_setstatus(s->s_setno) & MD_SET_BADTAG)) {
			for (i = 0; i < s->s_lbp->lb_dtblkcnt; i++)
				blkbusy(s, (s->s_lbp->lb_dtfirstblk + i));
		}
	}

	/* directory block/entry sectors */
	for (dbp = s->s_dbp; dbp != 0;	dbp = dbp->db_next) {
		blkbusy(s, dbp->db_blknum);
		for (dep = dbp->db_firstentry; dep != 0; dep = dep->de_next)
			for (i = 0; i < dep->de_blkcount; i++)
				blkbusy(s, dep->de_blks[i]);
	}
}

/*
 * Record <firstblk, offset, length> as free space by pushing a new entry
 * on the front of the set's incore device id free list.  Nothing is done
 * if the set does not use device id style replicas.
 *
 * Called:
 *    - During startup when all devid blocks are temporarily placed on the
 *       free list
 *    - After a devid has been deleted via the metadb command.
 *    - When mddb_devid_free_get adds unused space from a disk block
 *       to free list
 *
 * Always returns 0.
 */
static int
mddb_devid_free_add(
	mddb_set_t *s,
	uint_t firstblk,
	uint_t offset,
	uint_t length
)
{
	mddb_did_free_t	*newfree;

	if (s->s_lbp->lb_flags & MDDB_DEVID_STYLE) {
		newfree = kmem_zalloc(sizeof (mddb_did_free_t), KM_SLEEP);
		newfree->free_blk = firstblk;
		newfree->free_offset = offset;
		newfree->free_length = length;

		/* push on the head of the list */
		newfree->free_next = s->s_did_icp->did_ic_freep;
		s->s_did_icp->did_ic_freep = newfree;
	}

	return (0);
}

/*
 * Remove specific free space from the device id incore free list.
 * Called at startup (after all devid blocks have been placed on
 * free list) in order to remove the free space from the list that
 * contains actual devids.
 *
 * The first free list entry that completely covers <firstblk, offset,
 * length> is split: whatever lies before and after the removed area goes
 * back on the list, and pieces of zero length are discarded.
 *
 * Returns 0 if area successfully removed.
 * Returns 1 if no matching area is found - so nothing removed - or if the
 * set does not use device id style replicas.
 */
static int
mddb_devid_free_delete(
	mddb_set_t *s,
	uint_t firstblk,
	uint_t offset,
	uint_t length
)
{
	int		block_found = 0;
	mddb_did_free_t	*did_freep1;		/* next free block */
	mddb_did_free_t	*did_freep2 = 0;	/* previous free block */
	mddb_did_free_t *did_freep_before;	/* area before offset, len */
	mddb_did_free_t	*did_freep_after;	/* area after offset, len */
	uint_t		old_length;

	if (!(s->s_lbp->lb_flags & MDDB_DEVID_STYLE)) {
		return (1);
	}

	/* find free block for this devid */
	did_freep1 = s->s_did_icp->did_ic_freep;
	while (did_freep1) {
		/*
		 * Look through free list of <block, offset, length> to
		 * find our entry in the free list.  Our entry should
		 * exist since the entire devid block was placed into
		 * this free list at startup.  This code is just removing
		 * the non-free (in-use) portions of the devid block so
		 * that the remaining linked list does indeed just
		 * contain a free list.
		 *
		 * Our entry has been found if
		 *   - the blocks match,
		 *   - the offset (starting address) in the free list is
		 *	less than or equal to the offset of our entry and
		 *   - the length+offset (ending address) in the free list is
		 *	greater than or equal to the length+offset of our
		 *	entry.
		 */
		if ((did_freep1->free_blk == firstblk) &&
		    (did_freep1->free_offset <= offset) &&
		    ((did_freep1->free_length + did_freep1->free_offset) >=
		    (length + offset))) {
			/* Have found our entry - remove from list */
			block_found = 1;
			/* the matching entry is reused for the "before" area */
			did_freep_before = did_freep1;
			old_length = did_freep1->free_length;
			/* did_freep1 - pts to next free block */
			did_freep1 = did_freep1->free_next;
			if (did_freep2) {
				did_freep2->free_next = did_freep1;
			} else {
				s->s_did_icp->did_ic_freep = did_freep1;
			}

			/*
			 * did_freep_before points to area in block before
			 * offset, length.
			 */
			did_freep_before->free_length = offset -
			    did_freep_before->free_offset;
			/*
			 * did_freep_after points to area in block after
			 * offset, length.
			 */
			did_freep_after = (mddb_did_free_t *)kmem_zalloc
			    (sizeof (mddb_did_free_t), KM_SLEEP);
			did_freep_after->free_blk = did_freep_before->free_blk;
			did_freep_after->free_offset = offset + length;
			did_freep_after->free_length = old_length - length -
			    did_freep_before->free_length;
			/*
			 * Add before and after areas to free list
			 * If area before or after offset, length has length
			 * of 0, that entry is not added.
			 */
			if (did_freep_after->free_length) {
				did_freep_after->free_next = did_freep1;
				if (did_freep2) {
					did_freep2->free_next =
					    did_freep_after;
				} else {
					s->s_did_icp->did_ic_freep =
					    did_freep_after;
				}
				/* "before" (if kept) must link to "after" */
				did_freep1 = did_freep_after;
			} else {
				kmem_free(did_freep_after,
				    sizeof (mddb_did_free_t));
			}

			if (did_freep_before->free_length) {
				did_freep_before->free_next = did_freep1;
				if (did_freep2) {
					did_freep2->free_next =
					    did_freep_before;
				} else {
					s->s_did_icp->did_ic_freep =
					    did_freep_before;
				}
			} else {
				kmem_free(did_freep_before,
				    sizeof (mddb_did_free_t));
			}
			break;
		} else {
			did_freep2 = did_freep1;
			did_freep1 = did_freep1->free_next;
		}
	}
	if (block_found == 0) {
		return (1);
	} else {
		return (0);
	}
}

/*
 * Find free space of devid length and remove free space from list.
 * Return a pointer to the previously free area.
 *
 * The free list is searched first-fit.  A free list entry is only used if
 * the disk block it refers to can be found on the did_ic_dbp list.
 *
 * If there's not enough free space on the free list, get an empty
 * disk block, put the empty disk block on the did_ic_dbp linked list,
 * and add the disk block space not used for devid to the free list.
 *
 * On success the following are filled in for the caller:
 *	*blk	- first logical block of the disk block holding the area
 *	*cnt	- number of blocks in that disk block
 *	*offset	- byte offset of the area within the disk block
 * Note that *blk and *offset may have been written even if the call fails.
 *
 * Return pointer to address (inside disk block) of free area for devid.
 * Return 0 if error (set is not device id style, or no replica blocks are
 * available for a new disk block).
 */
static caddr_t
mddb_devid_free_get(
	mddb_set_t *s,
	uint_t len,
	uint_t *blk,
	uint_t *cnt,
	uint_t *offset
)
{
	mddb_did_free_t	*freep, *freep2;
	mddb_did_db_t	*dbp;
	uint_t		blk_cnt, blk_num;
	ddi_devid_t	devid_ptr = NULL;
	int		found = 0;

	if (!(s->s_lbp->lb_flags & MDDB_DEVID_STYLE)) {
		return (0);
	}

	freep = s->s_did_icp->did_ic_freep;
	freep2 = (mddb_did_free_t *)NULL;
	while (freep) {
		/* found a free area - remove from free list */
		if (len <= freep->free_length) {
			*blk = freep->free_blk;
			*offset = freep->free_offset;
			/* find disk block pointer that contains free area */
			dbp = s->s_did_icp->did_ic_dbp;
			while (dbp) {
				if (dbp->db_firstblk == *blk)
					break;
				else
					dbp = dbp->db_next;
			}
			/*
			 * If a disk block pointer can't be found - something
			 * is wrong, so don't use this free space.
			 */
			if (dbp == NULL) {
				freep2 = freep;
				freep = freep->free_next;
				continue;
			}

			devid_ptr = (ddi_devid_t)(dbp->db_ptr + *offset);
			*cnt = dbp->db_blkcnt;

			/* Update free list information */
			freep->free_offset += len;
			freep->free_length -= len;
			if (freep->free_length == 0) {
				if (freep2) {
					freep2->free_next =
					    freep->free_next;
				} else {
					s->s_did_icp->did_ic_freep =
					    freep->free_next;
				}
				kmem_free(freep, sizeof (mddb_did_free_t));
			}
			/*
			 * freep may have just been freed, so success is
			 * recorded in a flag rather than by looking at
			 * freep again after the loop.
			 */
			found = 1;
			break;
		}
		freep2 = freep;
		freep = freep->free_next;
	}

	/* Didn't find a free spot */
	if (!found) {
		/* get free logical disk blk in replica */
		blk_cnt = btodb(len + (MDDB_BSIZE - 1));
		blk_num = getfreeblks(s, blk_cnt);
		if (blk_num == 0)
			return (0);

		/* Add disk block to disk block linked list */
		dbp = kmem_zalloc(sizeof (mddb_did_db_t), KM_SLEEP);
		dbp->db_firstblk = blk_num;
		dbp->db_blkcnt = blk_cnt;
		dbp->db_ptr = (caddr_t)kmem_zalloc(dbtob(blk_cnt), KM_SLEEP);
		dbp->db_next = s->s_did_icp->did_ic_dbp;
		s->s_did_icp->did_ic_dbp = dbp;
		devid_ptr = (ddi_devid_t)dbp->db_ptr;

		/* Update return values */
		*blk = blk_num;
		*offset = 0;
		*cnt = blk_cnt;

		/* Add unused part of block to free list */
		(void) mddb_devid_free_add(s, blk_num,
		    len, (dbtob(blk_cnt) - len));
	}

	return ((caddr_t)devid_ptr);
}

/*
 * Store a device id and minor name for locator index in the set's incore
 * device id area.  Space for the device id comes from the free list (see
 * mddb_devid_free_get); the mddb_did_info entry for index is filled in,
 * including a checksum of the device id.
 *
 * This routine does not write any data out to disk.
 * After this routine has been called, the routine, writelocall, should
 * be called to write both the locator block and device id area out
 * to disk.
 *
 * Returns 0 on success or if a device id already exists for index.
 * Returns 1 if the set is not device id style, minor_name is too long to
 * fit, or no space could be obtained for the device id.
 */
static int
mddb_devid_add(
	mddb_set_t	*s,
	uint_t		index,
	ddi_devid_t	devid,
	char		*minor_name
)
{
	mddb_did_info_t	*info;
	ddi_devid_t	dest;
	uint_t		nbytes;
	uint_t		did_blk, did_off, did_blkcnt;
	const char	*src;
	char		*dst;
	uint_t		left;

	if (!(s->s_lbp->lb_flags & MDDB_DEVID_STYLE))
		return (1);

	/* info_minor_name needs room for the terminating null */
	if (strlen(minor_name) > (MDDB_MINOR_NAME_MAX - 1))
		return (1);

	/* Nothing to do if a device id has already been added */
	info = &(s->s_did_icp->did_ic_blkp->blk_info[index]);
	if (info->info_flags & MDDB_DID_EXISTS)
		return (0);

	nbytes = ddi_devid_sizeof(devid);
	dest = (ddi_devid_t)mddb_devid_free_get(s, nbytes, &did_blk,
	    &did_blkcnt, &did_off);
	if (dest == NULL)
		return (1);

	/* Copy devid into devid free area, one byte at a time */
	src = (const char *)devid;
	dst = (char *)dest;
	for (left = nbytes; left != 0; left--)
		*dst++ = *src++;

	/*
	 * Update mddb_did_info area for new device id.
	 *
	 * The UPDATED flag is only set for non-replicated import cases.
	 * This allows the side locator driver name index to get
	 * updated in load_old_replicas.
	 */
	info->info_flags = MDDB_DID_EXISTS | MDDB_DID_VALID;
	if (!(md_get_setstatus(s->s_setno) & MD_SET_REPLICATED_IMPORT))
		info->info_flags |= MDDB_DID_UPDATED;

	info->info_firstblk = did_blk;
	info->info_blkcnt = did_blkcnt;
	info->info_offset = did_off;
	info->info_length = nbytes;
	(void) strcpy(info->info_minor_name, minor_name);
	crcgen(dest, &info->info_checksum, nbytes, NULL);

	/* Add device id pointer to did_ic_devid array */
	s->s_did_icp->did_ic_devid[index] = dest;

	return (0);
}


/*
 * Drop the device id recorded for locator index: clear the flags of its
 * mddb_did_info entry, clear the incore device id pointer, and hand the
 * space the device id occupied back to the free list.
 *
 * This routine does not write any data out to disk.
 * After this routine has been called, the routine, writelocall, should
 * be called to write both the locator block and device id area out
 * to disk.
 *
 * Returns 0 on success.
 * Returns 1 if the set is not device id style or no device id exists for
 * index.
 */
static int
mddb_devid_delete(mddb_set_t *s, uint_t index)
{
	mddb_did_info_t	*info;

	if (!(s->s_lbp->lb_flags & MDDB_DEVID_STYLE))
		return (1);

	info = &(s->s_did_icp->did_ic_blkp->blk_info[index]);

	/*
	 * Ensure that the underlying device supports device ids
	 * before arbitrarily removing them.
	 */
	if (!(info->info_flags & MDDB_DID_EXISTS))
		return (1);

	/* Entry no longer describes a device id */
	info->info_flags = 0;
	s->s_did_icp->did_ic_devid[index] = (ddi_devid_t)NULL;

	/* The location fields are still intact; return the space */
	(void) mddb_devid_free_add(s, info->info_firstblk,
	    info->info_offset, info->info_length);

	return (0);
}

/*
 * Look up the device id and minor name stored for a locator index.
 *
 * Returns 1 and fills in *devid and *minor_name if a device id exists
 * for index.  Returns 0, leaving both untouched, if the set is not device
 * id style or no device id exists.
 *
 * Caller of this routine should not free devid or minor_name since
 * these will point to internal data structures that should not
 * be freed.
 */
static int
mddb_devid_get(
	mddb_set_t *s,
	uint_t index,
	ddi_devid_t *devid,
	char **minor_name
)
{
	mddb_did_info_t	*info;

	if (!(s->s_lbp->lb_flags & MDDB_DEVID_STYLE))
		return (0);

	info = &(s->s_did_icp->did_ic_blkp->blk_info[index]);
	if (!(info->info_flags & MDDB_DID_EXISTS))
		return (0);

	*devid = s->s_did_icp->did_ic_devid[index];
	*minor_name = info->info_minor_name;
	return (1);
}

/*
 * Check if device id is valid on current system.
 * Needs devid, previously known dev_t and current minor_name.
 *
 * Success:
 * 	Returns 0 if the device id / minor name pair maps to at least one
 * 	device.  If *dev is not among the devices found, *dev is replaced
 * 	by the first one in the list.
 * Failure:
 * 	Returns 1 if dev is a null pointer or the device id is not valid
 * 	on the current system.
 */
static int
mddb_devid_validate(ddi_devid_t devid, md_dev64_t *dev, char *minor_name)
{
	dev_t		*devlist;
	int		ndevs;
	int		idx;

	if (dev == 0)
		return (1);

	if (ddi_lyr_devid_to_devlist(devid, minor_name,
	    &ndevs, &devlist) != DDI_SUCCESS)
		return (1);

	if (ndevs <= 0)
		return (1);

	/* devid is valid to use; does a dev_t in the list match dev? */
	for (idx = 0; idx < ndevs; idx++) {
		if (*dev == md_expldev(devlist[idx]))
			break;
	}

	/* No match, so the device has moved: adopt the first dev_t */
	if (idx == ndevs)
		*dev = md_expldev(devlist[0]);

	ddi_lyr_free_devlist(devlist, ndevs);
	return (0);
}


/*
 * Free the devid incore data areas: the device id block (sized from
 * lbp->lb_didblkcnt), every disk block on the did_ic_dbp list together
 * with its data, every free list entry, and finally the incore structure
 * itself.  *did_icp is set to NULL afterwards.  A NULL *did_icp is a no-op.
 */
static void
mddb_devid_icp_free(mddb_did_ic_t **did_icp, mddb_lb_t *lbp)
{
	mddb_did_ic_t	*icp = *did_icp;
	mddb_did_db_t	*dbp, *dbp_next;
	mddb_did_free_t	*frp, *frp_next;

	if (icp == NULL)
		return;

	if (icp->did_ic_blkp) {
		kmem_free((caddr_t)icp->did_ic_blkp,
		    dbtob(lbp->lb_didblkcnt));
		icp->did_ic_blkp = (mddb_did_blk_t *)NULL;
	}

	/* disk blocks holding the device ids */
	for (dbp = icp->did_ic_dbp; dbp != NULL; dbp = dbp_next) {
		dbp_next = dbp->db_next;
		kmem_free((caddr_t)dbp->db_ptr, dbtob(dbp->db_blkcnt));
		kmem_free((caddr_t)dbp, sizeof (mddb_did_db_t));
	}

	/* free space list */
	for (frp = icp->did_ic_freep; frp != NULL; frp = frp_next) {
		frp_next = frp->free_next;
		kmem_free((caddr_t)frp, sizeof (mddb_did_free_t));
	}

	kmem_free((caddr_t)icp, sizeof (mddb_did_ic_t));
	*did_icp = (mddb_did_ic_t *)NULL;
}

/*
 * getphysblk
 *
 * Translate logical replica block blk into a physical block number, using
 * the chain of master blocks starting at mbip.  Each master block covers
 * mb_blkcnt logical blocks; the chain is walked until the one covering
 * blk is reached.
 *
 * Returns the physical block number, or -1 if blk lies beyond the end of
 * the chain or beyond the consecutive blocks mapped by its master block.
 */
static daddr_t
getphysblk(
	mddb_block_t		blk,
	mddb_mb_ic_t		*mbip
)
{
	mddb_mb_t	*mb;

	for (;;) {
		mb = &(mbip->mbi_mddb_mb);
		if (blk < mb->mb_blkcnt)
			break;

		if (! mbip->mbi_next)
			return ((daddr_t)-1);	/* no such block */

		/* make blk relative to the next master block */
		blk -= mb->mb_blkcnt;
		mbip = mbip->mbi_next;
	}

	if (blk >= mb->mb_blkmap.m_consecutive)
		return ((daddr_t)-1);	/* no such block */

	return ((daddr_t)(mb->mb_blkmap.m_firstblk + blk));
}

/*
 * putblks
 *
 * Write cnt blocks (MDDB_BSIZE bytes each) from buffer to device, starting
 * at physical block blk.
 *
 * With bufhead set the write is started asynchronously: the buffer header
 * is pushed on the front of the *bufhead chain and 0 is returned without
 * waiting.  When a buf header is passed in the new buffer must be put on
 * the front of the chain - writerec counts on it.
 *
 * Without bufhead the write is synchronous and the buffer header is
 * released before returning.  Returns 0 on success; on failure a replica
 * erred event is posted and MDDB_F_EWRITE is returned.
 */
static int
putblks(
	mddb_set_t	*s,		/* incore db set structure */
	caddr_t		buffer,		/* adr of buffer to be written */
	daddr_t		blk,		/* block number for first block */
	int		cnt,		/* number of blocks to be written */
	md_dev64_t	device,		/* device to be written to */
	mddb_bf_t	**bufhead	/* if non-zero then ASYNC I/O */
					/*    and put buf address here */
)
{
	mddb_bf_t	*hdr = allocbuffer(s, MDDB_SLEEPOK);
	buf_t		*iobuf = &hdr->bf_buf;
	int		io_err;

	iobuf->b_bcount = MDDB_BSIZE * cnt;
	iobuf->b_un.b_addr = buffer;
	iobuf->b_blkno = blk;
	iobuf->b_edev = md_dev64_to_dev(device);

	/*
	 * if a header for a buf chain is passed in this is async io.
	 * currently only done for optimize  records
	 */
	if (bufhead) {
		hdr->bf_next = *bufhead;
		*bufhead = hdr;
		(void) mddb_rwdata(s, B_WRITE|B_ASYNC, iobuf);
		return (0);
	}

	io_err = mddb_rwdata(s, B_WRITE, iobuf);
	freebuffer(s, hdr);
	if (io_err == 0)
		return (0);

	SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED, SVM_TAG_REPLICA,
	    s->s_setno, device);
	return (MDDB_F_EWRITE);
}

/*
 * wrtblklst - takes an array of logical block numbers
 *		and writes the buffer to those blocks (scatter).
 * If called during upgrade, this routine expects a
 * non-translated (aka target) dev.
 *
 * Logical blocks whose physical blocks are adjacent are coalesced into a
 * single putblks() call.
 *
 * Returns:
 *	- 0 on success, or when this is a MN diskset, master_only is
 *	  MDDB_WR_ONLY_MASTER and this node is not the master (nothing
 *	  is written in that case).
 *	- MDDB_F_EWRITE if this is a MN diskset and there is no incore
 *	  master block chain for locator li.
 *	- 1 if the locator's device translates to NODEV64.
 *	- otherwise the non-zero value returned by putblks().
 */
static int
wrtblklst(
	mddb_set_t	*s,		/* incore set structure */
	caddr_t		buffer,		/* buffer to be written (record blk) */
	mddb_block_t	blka[],		/* list of logical blks for record */
	daddr_t		cnt,		/* number of logical blks */
	const int	li,		/* locator index */
	mddb_bf_t	**bufhead,	/* if non-zero then ASYNC I/O */
					/*    and put buf address here */
	int		master_only	/* allow only master node to write */
)
{
	daddr_t		blk;
	daddr_t		blk1;
	int		err = 0;
	int		cons;
	mddb_lb_t	*lbp = s->s_lbp;
	mddb_locator_t	*lp = &lbp->lb_locators[li];
	md_dev64_t	dev;
	mddb_mb_ic_t	*mbip = s->s_mbiarray[li];

	/*
	 * If a MN diskset and only the master can write,
	 * then a non-master node will just return success.
	 */
	if (lbp->lb_flags & MDDB_MNSET) {
		if (master_only == MDDB_WR_ONLY_MASTER) {
			/* return successfully if we aren't the master */
			if (!(md_set[s->s_setno].s_am_i_master)) {
				return (0);
			}
		}
		/* no master block chain for this replica: cannot map blocks */
		if (mbip == NULL)
			return (MDDB_F_EWRITE);
	}

	dev = md_xlate_targ_2_mini(md_expldev(lp->l_dev));
	if (dev == NODEV64) {
		return (1);
	}

	/*
	 * blk is the physical start of the current run.
	 * NOTE(review): getphysblk() can return -1; that is only caught by
	 * the ASSERTs below -- confirm callers never pass unmapped blocks.
	 */
	blk = getphysblk(blka[0], mbip);
	ASSERT(blk >= 0);

	/* cons is the number of blocks in the current contiguous run */
	cons = 1;
	while (cnt) {
		if (cons != cnt) {
			/* extend the run while the next block is adjacent */
			blk1 = getphysblk(blka[cons], mbip);
			ASSERT(blk1 >= 0);
			if ((blk + cons) == blk1) {
				cons++;
				continue;
			}
		}
		/* run is complete (or the list is exhausted): write it */
		if (err = putblks(s, buffer, blk, cons, dev, bufhead)) {
			/*
			 * If an MN diskset and any_node_can_write
			 * then this request is coming from writeoptrecord
			 * and l_flags field should not be updated.
			 * l_flags will be updated as a result of sending
			 * a class1 message to the master.  Setting l_flags
			 * here will cause slave to be out of sync with
			 * master.
			 *
			 * Otherwise, set the error in l_flags
			 * (this occurs if this is not a MN diskset or
			 * only_master_can_write is set).
			 */
			if ((!(lbp->lb_flags & MDDB_MNSET)) ||
			    (master_only == MDDB_WR_ONLY_MASTER)) {
				lp->l_flags |= MDDB_F_EWRITE;
			}
			return (err);
		}
		/*
		 * putblks() pushed the new buf on the front of the chain,
		 * so *bufhead is the buf just issued; tag it with its
		 * locator.
		 */
		if (bufhead)
			(*bufhead)->bf_locator = lp;

		/* advance past the run just written and start a new one */
		buffer += MDDB_BSIZE * cons;
		cnt -= cons;
		blka += cons;
		if (cnt) {
			blk = getphysblk(blka[0], mbip);
			ASSERT(blk >= 0);
		}
		cons = 1;
	}

	return (0);
}

/*
 * writeblks - takes a logical block number/block count pair
 * 		and writes the buffer to those contiguous logical blocks.
 * If called during upgrade, this routine expects a non-translated
 * (aka target) dev.
 *
 * Returns:
 *	- 0 on success, or when this is a MN diskset, master_only is
 *	  MDDB_WR_ONLY_MASTER and this node is not the master.
 *	- 1 if the locator's device translates to NODEV64.
 *	- otherwise the non-zero value from wrtblklst() (cnt > 1) or
 *	  putblks() (single block); in the single block case the
 *	  locator is also flagged MDDB_F_EWRITE.
 */
static int
writeblks(
	mddb_set_t	*s,		/* incore set structure */
	caddr_t		buffer,		/* buffer to be written */
	mddb_block_t	blk,		/* starting logical block number */
	int		cnt,		/* number of log blocks to be written */
	const int	li,		/* locator index */
	int		master_only	/* allow only master node to write */
)
{
	daddr_t		physblk;
	int		err = 0;
	int		i;
	mddb_lb_t	*lbp = s->s_lbp;
	mddb_locator_t	*lp = &lbp->lb_locators[li];
	md_dev64_t	dev;
	mddb_block_t	*blkarray;
	int		size;
	int		ret;

	/*
	 * If a MN diskset and only the master can write,
	 * then a non-master node will just return success.
	 */
	if ((lbp->lb_flags & MDDB_MNSET) &&
	    (master_only == MDDB_WR_ONLY_MASTER)) {
		/* return successfully if we aren't the master */
		if (!(md_set[s->s_setno].s_am_i_master)) {
			return (0);
		}
	}

	dev = md_xlate_targ_2_mini(md_expldev(lp->l_dev));
	if (dev == NODEV64) {
		return (1);
	}

	if (cnt > 1) {
		/*
		 * Multi-block write: build the list of consecutive logical
		 * block numbers and let wrtblklst() do the mapping and I/O.
		 *
		 * NOTE(review): MDDB_WR_ONLY_MASTER is passed here rather
		 * than the caller's master_only, so a non-master MN node
		 * called with any other master_only value writes nothing on
		 * this path but does write on the single block path below --
		 * confirm this asymmetry is intended.
		 */
		size = sizeof (mddb_block_t) * cnt;
		blkarray = (mddb_block_t *)kmem_alloc(size, KM_SLEEP);
		for (i = 0; i < cnt; i++)
			blkarray[i] = blk + i;
		ret = wrtblklst(s, buffer, blkarray, cnt,
		    li, 0, MDDB_WR_ONLY_MASTER);
		kmem_free(blkarray, size);
		return (ret);
	}
	/* single block: map it and write it synchronously */
	physblk = getphysblk(blk, s->s_mbiarray[li]);
	ASSERT(physblk > 0);
	if (err = putblks(s, buffer, physblk, 1, dev, (mddb_bf_t **)0)) {
		lp->l_flags |= MDDB_F_EWRITE;
		return (err);
	}
	return (0);
}

/*
 * writeall - push the same buffer to every replica that is flagged
 * active and has not already been flagged with a write error.
 *
 * The result is the bitwise OR of the writeblks() return values for the
 * replicas that were attempted (0 if none were attempted or all
 * succeeded).
 */
static int
writeall(
	mddb_set_t	*s,		/* incore set structure */
	caddr_t		buffer,		/* buffer to be written */
	mddb_block_t	block,		/* starting logical block number */
	int		cnt,		/* number of log blocks to be written */
	int		master_only	/* allow only master node to write */
)
{
	mddb_lb_t	*lbp = s->s_lbp;
	int		result = 0;
	int		idx;

	for (idx = 0; idx < lbp->lb_loccnt; idx++) {
		mddb_locator_t	*loc = lbp->lb_locators + idx;

		if ((loc->l_flags & MDDB_F_ACTIVE) &&
		    !(loc->l_flags & MDDB_F_EWRITE)) {
			result |= writeblks(s, buffer, block, cnt, idx,
			    master_only);
		}
	}

	return (result);
}

/*
 * writelocall - write the locator block and device id information (if
 * replica is in device id format) to all ACTIVE/NON-ERRORED replicas.
 *
 * Increments the locator block's commitcnt.  Updates the device id area's
 * commitcnt if the replica is in device id format.  Regenerates the
 * checksums after updating the commitcnt(s).
 *
 * Returns the bitwise OR of the writeblks() results for every block
 * range written (0 if everything succeeded).
 */
static int
writelocall(
	mddb_set_t	*s	/* incore set structure */
)
{
	int		li;
	int		err = 0;
	mddb_lb_t	*lbp = s->s_lbp;
	mddb_did_blk_t	*did_blk;
	mddb_did_db_t	*did_dbp;

	/* bump the commit count, then regenerate the affected checksums */
	s->s_lbp->lb_commitcnt++;
	if (lbp->lb_flags & MDDB_DEVID_STYLE) {
		/* did_blk is only assigned, and only used, for devid style */
		did_blk = s->s_did_icp->did_ic_blkp;
		did_blk->blk_commitcnt = s->s_lbp->lb_commitcnt;
		crcgen(did_blk, &did_blk->blk_checksum,
		    dbtob(lbp->lb_didblkcnt), NULL);
	}
	crcgen(lbp, &lbp->lb_checksum, dbtob(lbp->lb_blkcnt), NULL);

	for (li = 0; li < lbp->lb_loccnt; li++) {
		mddb_locator_t	*lp = &lbp->lb_locators[li];

		/* skip replicas that are inactive or already write-errored */
		if ((! (lp->l_flags & MDDB_F_ACTIVE)) ||
		    (lp->l_flags & MDDB_F_EWRITE))
			continue;

		if (lbp->lb_flags & MDDB_DEVID_STYLE) {
			/* write out blocks containing actual device ids */
			did_dbp = s->s_did_icp->did_ic_dbp;
			while (did_dbp) {
				err |= writeblks(s, (caddr_t)did_dbp->db_ptr,
				    did_dbp->db_firstblk,
				    did_dbp->db_blkcnt, li,
				    MDDB_WR_ONLY_MASTER);
				did_dbp = did_dbp->db_next;
			}

			/* write out device id area block */
			err |= writeblks(s, (caddr_t)did_blk,
			    lbp->lb_didfirstblk, lbp->lb_didblkcnt, li,
			    MDDB_WR_ONLY_MASTER);
		}
		/* write out locator block */
		err |= writeblks(s, (caddr_t)lbp, 0, lbp->lb_blkcnt, li,
		    MDDB_WR_ONLY_MASTER);
	}

	/*
	 * If a MN diskset and this is the master, set the PARSE_LOCBLK flag
	 * in the mddb_set structure to show that the locator block has
	 * been changed.
	 */

	if ((lbp->lb_flags & MDDB_MNSET) &&
	    (md_set[s->s_setno].s_am_i_master)) {
		s->s_mn_parseflags |= MDDB_PARSE_LOCBLK;
	}
	return (err);
}

/*
 * getblks - read cnt blocks of MDDB_BSIZE bytes from the given device,
 * starting at physical block blk, into buffer.
 *
 * If called during upgrade, this routine expects a translated
 * (aka miniroot) dev.
 *
 * Returns 0 on success.  On failure an ESC_SVM_ERRED event is posted for
 * the replica and MDDB_F_EREAD is returned.
 */
static int
getblks(
	mddb_set_t	*s,	/* incore db set structure */
	caddr_t		buffer,	/* buffer to read data into */
	md_dev64_t	device,	/* device to read from */
	daddr_t		blk,	/* physical block number to read */
	int		cnt,	/* number of blocks to read */
	int		flag	/* flags for I/O */
)
{
	/* per the original author, this allocation will never sleep */
	mddb_bf_t	*bfp = allocbuffer(s, MDDB_SLEEPOK);
	buf_t		*bp = &bfp->bf_buf;
	int		rwerr;

	/* describe the transfer */
	bp->b_bcount = MDDB_BSIZE * cnt;
	bp->b_un.b_addr = buffer;
	bp->b_blkno = blk;
	bp->b_edev = md_dev64_to_dev(device);

	rwerr = mddb_rwdata(s, (B_READ | flag), bp);
	freebuffer(s, bfp);
	if (rwerr == 0)
		return (0);

	SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED, SVM_TAG_REPLICA,
	    s->s_setno, device);
	return (MDDB_F_EREAD);
}

/*
 * readblklst - takes an array of logical block numbers
 * 		and reads those blocks (gather) into the buffer.
 * If called during upgrade, this routine expects a non-translated
 * (aka target) dev.
 *
 * Logical blocks whose physical blocks are adjacent are coalesced into a
 * single getblks() call.
 *
 * Returns 0 on success, 1 if the locator's device translates to NODEV64,
 * otherwise the non-zero value returned by getblks().
 */
static int
readblklst(
	mddb_set_t	*s,	/* incore set structure */
	caddr_t		buffer,	/* buffer to be read (record block) */
	mddb_block_t	blka[],	/* list of logical blocks to be read */
	daddr_t		cnt,	/* number of logical blocks */
	int		li,	/* locator index */
	int		flag	/* flags for I/O */
)
{
	daddr_t		blk;
	daddr_t		blk1;
	int		err = 0;
	int		cons;
	md_dev64_t	dev;
	mddb_mb_ic_t	*mbip;

	mbip = s->s_mbiarray[li];
	dev = md_expldev(s->s_lbp->lb_locators[li].l_dev);
	dev = md_xlate_targ_2_mini(dev);
	if (dev == NODEV64) {
		return (1);
	}

	/*
	 * blk is the physical start of the current run.
	 * NOTE(review): a -1 ("no such block") result from getphysblk() is
	 * only caught by the ASSERTs -- confirm callers never pass unmapped
	 * blocks.
	 */
	blk = getphysblk(blka[0], mbip);
	ASSERT(blk >= 0);

	/* cons is the number of blocks in the current contiguous run */
	cons = 1;
	while (cnt) {
		if (cons != cnt) {
			/* extend the run while the next block is adjacent */
			blk1 = getphysblk(blka[cons], mbip);
			ASSERT(blk1 >= 0);
			if ((blk + cons) == blk1) {
				cons++;
				continue;
			}
		}
		/* run is complete (or the list is exhausted): read it */
		if (err = getblks(s, buffer, dev, blk, cons, flag))
			return (err);
		/* advance past the run just read and start a new one */
		buffer += MDDB_BSIZE * cons;
		cnt -= cons;
		blka += cons;
		if (cnt) {
			blk = getphysblk(blka[0], mbip);
			ASSERT(blk >= 0);
		}
		cons = 1;
	}
	return (0);
}

/*
 * readblks - read cnt contiguous logical blocks, starting at logical
 * block blk, from the replica at locator index li into buffer.
 *
 * A single block is mapped and read directly; a multi-block request is
 * expanded into a list of logical block numbers and handed to
 * readblklst().
 *
 * If called during upgrade, this routine expects a non-translated
 * (aka target) dev.
 *
 * Returns 0 on success, 1 if the locator's device translates to NODEV64
 * (single block case), otherwise the value from getblks()/readblklst().
 */
static int
readblks(
	mddb_set_t	*s,	/* incore set structure */
	caddr_t		buffer,	/* buffer to be read into */
	mddb_block_t	blk,	/* logical block number to be read */
	int		cnt,	/* number of logical blocks to be read */
	int		li	/* locator index */
)
{
	mddb_block_t	*list;
	daddr_t		physblk;
	md_dev64_t	device;
	int		listsz;
	int		rval;
	int		n;

	if (cnt <= 1) {
		physblk = getphysblk(blk, s->s_mbiarray[li]);
		ASSERT(physblk > 0);
		device = md_xlate_targ_2_mini(
		    md_expldev(s->s_lbp->lb_locators[li].l_dev));
		if (device == NODEV64)
			return (1);
		return (getblks(s, buffer, device, physblk, 1, 0));
	}

	/* build blk, blk + 1, ..., blk + cnt - 1 */
	listsz = sizeof (mddb_block_t) * cnt;
	list = (mddb_block_t *)kmem_alloc(listsz, KM_SLEEP);
	for (n = 0; n < cnt; n++)
		list[n] = blk + n;

	rval = readblklst(s, buffer, list, cnt, li, 0);
	kmem_free(list, listsz);
	return (rval);
}

/*
 * single_thread_start - acquire the set's single-thread lock.
 *
 * While the lock is held by someone else, record that it is wanted and
 * wait on s_single_thread_cv using the set mutex.
 */
static void
single_thread_start(
	mddb_set_t	*s
)
{
	for (;;) {
		if (!s->s_singlelockgotten)
			break;
		s->s_singlelockwanted++;
		cv_wait(&s->s_single_thread_cv, SETMUTEX(s->s_setno));
	}
	s->s_singlelockgotten++;
}

/*
 * single_thread_end - release the set's single-thread lock and, if any
 * thread recorded that it wants the lock, wake all waiters.
 */
static void
single_thread_end(
	mddb_set_t	*s
)
{
	ASSERT(s->s_singlelockgotten);
	s->s_singlelockgotten = 0;

	/* nobody waiting: nothing to wake */
	if (s->s_singlelockwanted == 0)
		return;

	s->s_singlelockwanted = 0;
	cv_broadcast(&s->s_single_thread_cv);
}

/*
 * sizeofde - size in bytes of an incore directory entry: the base
 * structure with its single declared block slot replaced by
 * de_blkcount slots.
 */
static size_t
sizeofde(
	mddb_de_ic_t	*dep
)
{
	return (sizeof (mddb_de_ic_t) +
	    sizeof (mddb_block_t) * dep->de_blkcount -
	    sizeof (mddb_block_t));
}

/*
 * sizeofde32 - size in bytes of a 32-bit directory entry: the base
 * structure with its declared de32_blks member replaced by
 * de32_blkcount block slots.
 */
static size_t
sizeofde32(
	mddb_de32_t	*dep
)
{
	return (sizeof (*dep) +
	    sizeof (mddb_block_t) * dep->de32_blkcount -
	    sizeof (dep->de32_blks));
}

/*
 * nextentry - address of the 32-bit directory entry that immediately
 * follows dep, i.e. dep advanced by sizeofde32(dep) bytes.
 */
static mddb_de32_t *
nextentry(
	mddb_de32_t	*dep
)
{
	caddr_t		here = (caddr_t)dep;

	return ((mddb_de32_t *)((void *)(here + sizeofde32(dep))));
}

/*
 * create_db32rec - build the 32-bit form of a directory block.
 *
 * The incore directory block dbp is converted into db32p with dbtodb32(),
 * and each incore entry on dbp's list is converted with detode32() into
 * consecutive, variable sized 32-bit entries packed directly after the
 * db32_firstentry field.
 *
 * The 32-bit "pointer" fields only have to be zero/non-zero: when the
 * incore pointer is non-NULL but its converted value came out as 0, the
 * field is forced to the non-zero marker 0x4.
 *
 * db32p is expected to point at a buffer of MDDB_BSIZE bytes (the caller
 * in this file allocates exactly that); the final ASSERT checks that the
 * packed entries did not run past the end of that buffer.
 */
static void
create_db32rec(
	mddb_db32_t *db32p,
	mddb_db_t *dbp
)
{
	mddb_de_ic_t *dep;
	mddb_de32_t *de32p;

#if defined(_ILP32) && !defined(lint)
	ASSERT(sizeof (mddb_de_t) == sizeof (mddb_de32_t));
	ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t));
#endif

	dbtodb32(dbp, db32p);
	if ((dbp->db_firstentry != NULL) && (db32p->db32_firstentry == 0))
		db32p->db32_firstentry = 0x4;
	/* the first packed entry starts right after db32_firstentry */
	de32p = (mddb_de32_t *)((void *) ((caddr_t)(&db32p->db32_firstentry)
	    + sizeof (db32p->db32_firstentry)));
	for (dep = dbp->db_firstentry; dep; dep = dep->de_next) {
		detode32(dep, de32p);
		if ((dep->de_next != NULL) && (de32p->de32_next == 0))
			de32p->de32_next = 0x4;
		de32p = nextentry(de32p);
	}
	/*
	 * The end of the packed entries must not lie beyond the block.
	 * (This used to compare de32p against itself plus MDDB_BSIZE,
	 * which can never fail.)
	 */
	ASSERT((uintptr_t)de32p <= (uintptr_t)db32p + MDDB_BSIZE);
}

/*
 * getmasters - open the device and read/validate the master block at
 * block number blkno.
 *
 * If called during upgrade, this routine expects a translated
 * (aka miniroot) dev.
 * If master blocks are found, set the mn_set parameter to 1 if the
 * the master block revision number is MDDB_REV_MNMB; otherwise,
 * set it to 0.
 * If master blocks are not found, do not change the mnset parameter.
 *
 * Returns a newly allocated incore master block (MDDB_IC_BSIZE bytes)
 * on success; the device is left open in that case.  On failure the
 * error bits are OR'ed into *flag (if flag is non-NULL), the allocation
 * is freed, the device is closed (unless the open itself failed) and
 * NULL is returned.
 *
 * NOTE(review): unlike flag, mn_set is dereferenced without a NULL
 * check -- confirm every caller passes a valid pointer.
 */
static mddb_mb_ic_t *
getmasters(
	mddb_set_t	*s,
	md_dev64_t	dev,
	daddr_t		blkno,
	uint_t		*flag,
	int		*mn_set
)
{
	mddb_mb_ic_t	*mbi = NULL;
	mddb_mb_t	*mb;
	int		error = 0;
	ddi_devid_t	devid;


	if (mddb_devopen(dev)) {
		if (flag)
			*flag |= MDDB_F_EMASTER;
		return ((mddb_mb_ic_t *)NULL);
	}


	mbi = (mddb_mb_ic_t *)kmem_zalloc(MDDB_IC_BSIZE, KM_SLEEP);
	mb = &(mbi->mbi_mddb_mb);
	if (error = getblks(s, (caddr_t)mb, dev, blkno,
	    btodb(MDDB_BSIZE), 0)) {
		error |= MDDB_F_EMASTER;
	}
	/*
	 * NOTE(review): the checks below run even when the read failed,
	 * and each one that fails overwrites (rather than ORs into) error,
	 * so a read error can end up reported as a format error.
	 */
	if (mb->mb_magic != MDDB_MAGIC_MB) {
		error = MDDB_F_EFMT | MDDB_F_EMASTER;
	}
	/* Check for MDDB_REV_MNMB and lower */
	if (revchk(MDDB_REV_MNMB, mb->mb_revision)) {
		error = MDDB_F_EFMT | MDDB_F_EMASTER;
	}
	if (crcchk(mb, &mb->mb_checksum, MDDB_BSIZE, NULL)) {
		error = MDDB_F_EFMT | MDDB_F_EMASTER;
	}

	/* the set number must match unless the set is being imported */
	if (!(md_get_setstatus(s->s_setno) &
	    (MD_SET_IMPORT | MD_SET_REPLICATED_IMPORT)) &&
	    (mb->mb_setno != s->s_setno)) {
		error = MDDB_F_EFMT | MDDB_F_EMASTER;
	}
	/* the block must record the block number it was read from */
	if (mb->mb_blkno != blkno) {
		error = MDDB_F_EFMT | MDDB_F_EMASTER;
	}
	/* the on-disk link fields are meaningless incore */
	mb->mb_next = NULL;
	mbi->mbi_next = NULL;

	if (error)
		goto out;

	/*
	 * Check the md_devid_destroy and md_keep_repl_state flags
	 * to see if we need to regen the devid or not.
	 *
	 * Don't care about devid in local set since it is not used
	 * and this should not be part of set importing
	 */
	if ((s->s_setno != MD_LOCAL_SET) &&
	    !(md_get_setstatus(s->s_setno) &
	    (MD_SET_IMPORT | MD_SET_REPLICATED_IMPORT))) {
		/*
		 * Now check the destroy flag. We also need to handle
		 * the case where the destroy flag is reset after the
		 * destroy
		 */
		if (md_devid_destroy || (mb->mb_devid_len == 0)) {

			if (md_devid_destroy) {
				bzero(mb->mb_devid, mb->mb_devid_len);
				mb->mb_devid_len = 0;
			}

			/*
			 * Try to regenerate it if the 'keep' flag is not set
			 */
			if (!md_keep_repl_state) {
				if (ddi_lyr_get_devid(md_dev64_to_dev(dev),
				    &devid) == DDI_SUCCESS) {
					/*
					 * NOTE(review): the devid is copied
					 * without checking its size against
					 * the room available at mb_devid --
					 * confirm it always fits.
					 */
					mb->mb_devid_len =
					    ddi_devid_sizeof(devid);
					bcopy(devid, mb->mb_devid,
					    mb->mb_devid_len);
					ddi_devid_free(devid);
				} else {
					error = MDDB_F_EFMT | MDDB_F_EMASTER;
				}
			}

			/* the block was modified: regenerate its checksum */
			crcgen(mb, &mb->mb_checksum, MDDB_BSIZE, NULL);

			/*
			 * Push
			 */
			if (putblks(s, (caddr_t)mb, blkno, 1, dev, 0) != 0) {
				error = MDDB_F_EFMT | MDDB_F_EMASTER;
			}
		}
	}

	if (! error) {
		/* Set mn_set parameter to 1 if a MN set */
		if (mb->mb_revision == MDDB_REV_MNMB)
			*mn_set = 1;
		else
			*mn_set = 0;
		return (mbi);
	}

out:
	/* Error Out */
	if (flag)
		*flag |= error;

	kmem_free((caddr_t)mbi, MDDB_IC_BSIZE);
	mddb_devclose(dev);
	return ((mddb_mb_ic_t *)NULL);
}

/*
 * getrecord - allocate the record buffer for directory entry dep and
 * fill it from the replica at locator index li.
 *
 * The buffer (de_recsize bytes) is always attached to dep->de_rb, even
 * when an error is returned; it is not freed here.
 *
 * Returns:
 *	- 0 if the record was read and passed the magic, revision and
 *	  crc checks.
 *	- MDDB_F_EDATA OR'ed with the readblklst() error on a read failure.
 *	- MDDB_F_EFMT | MDDB_F_EDATA if any of the checks failed.
 */
static int
getrecord(
	mddb_set_t	*s,
	mddb_de_ic_t	*dep,
	int		li
)
{
	mddb_rb32_t	*rec;
	int		rderr;

#if defined(_ILP32) && !defined(lint)
	ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t));
#endif

	rec = (mddb_rb32_t *)kmem_zalloc(dep->de_recsize, KM_SLEEP);
	dep->de_rb = rec;

	rderr = readblklst(s, (caddr_t)rec, dep->de_blks,
	    dep->de_blkcount, li, 0);
	if (rderr != 0)
		return (MDDB_F_EDATA | rderr);

	if (rec->rb_magic != MDDB_MAGIC_RB)
		return (MDDB_F_EFMT | MDDB_F_EDATA);

	/* the revision has to be acceptable to at least one known format */
	if (!((revchk(MDDB_REV_RB, rec->rb_revision) == 0) ||
	    (revchk(MDDB_REV_RB64, rec->rb_revision) == 0) ||
	    (revchk(MDDB_REV_RBFN, rec->rb_revision) == 0) ||
	    (revchk(MDDB_REV_RB64FN, rec->rb_revision) == 0)))
		return (MDDB_F_EFMT | MDDB_F_EDATA);

	/* finally the record's crc has to check out */
	if (rec_crcchk(s, dep, rec))
		return (MDDB_F_EFMT | MDDB_F_EDATA);

	return (0);
}

/*
 * readlocnames - read the locator name blocks from the replica at
 * locator index li and validate them.
 *
 * s->s_lnp is cleared first.  On success it is set to a newly allocated
 * buffer holding the locator names and 0 is returned.  On failure the
 * buffer is freed, s->s_lnp stays NULL and the error is returned:
 * the readblks() error OR'ed with MDDB_F_EDATA for a read failure, or
 * MDDB_F_EDATA | MDDB_F_EFMT when the magic, revision or crc is bad.
 */
static int
readlocnames(
	mddb_set_t	*s,
	int		li
)
{
	mddb_block_t	ln_blkcnt, ln_blkno;
	mddb_ln_t	*lnp;
	int		err;

	s->s_lnp = NULL;

	ln_blkno = s->s_lbp->lb_lnfirstblk;
	ln_blkcnt = s->s_lbp->lb_lnblkcnt;
	lnp = (mddb_ln_t *)kmem_zalloc(dbtob(ln_blkcnt), KM_SLEEP);

	err = readblks(s, (caddr_t)lnp, ln_blkno, ln_blkcnt, li);
	if (err != 0) {
		err |= MDDB_F_EDATA;
	} else if (lnp->ln_magic != MDDB_MAGIC_LN) {
		err = MDDB_F_EDATA | MDDB_F_EFMT;
	} else {
		int	badrev;

		/* MN disksets use their own locator name revision */
		if (s->s_lbp->lb_flags & MDDB_MNSET)
			badrev = revchk(MDDB_REV_MNLN, lnp->ln_revision);
		else
			badrev = revchk(MDDB_REV_LN, lnp->ln_revision);

		if (badrev || crcchk(lnp, &lnp->ln_checksum,
		    dbtob(ln_blkcnt), NULL))
			err = MDDB_F_EDATA | MDDB_F_EFMT;
	}

	/* on any error throw the blocks away and report it */
	if (err != 0) {
		kmem_free((caddr_t)lnp, dbtob(ln_blkcnt));
		return (err);
	}

	s->s_lnp = lnp;
	return (0);
}

/*
 * readcopy - read in a copy of the database from the replica at locator
 * index li.
 *
 * Pass 1 follows the chain of directory blocks starting at
 * lb_dbfirstblk, converting each 32-bit on-disk directory block and its
 * packed entries into a linked list of incore mddb_db_t/mddb_de_ic_t
 * structures.  Pass 2 reads every record that is not flagged
 * MDDB_F_OPT and folds the record checksums into a global XOR that
 * must come out as zero.
 *
 * On success 0 is returned and s->s_dbp points at the incore directory
 * list.  On any failure everything allocated here is freed, s->s_dbp is
 * set to NULL and the error flags are returned.
 */

static int
readcopy(
	mddb_set_t	*s,
	int		li
)
{
	uint_t		blk;
	mddb_db_t	*dbp, *dbp1, *dbhp;
	mddb_db32_t	*db32p;
	mddb_de_ic_t	*dep, *dep2;
	mddb_de32_t	*de32p, *de32p2;
	int		err = 0;
	uint_t		checksum;


#if defined(_ILP32) && !defined(lint)
	ASSERT(sizeof (mddb_de_t) == sizeof (mddb_de32_t));
	ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t));
#endif

	dbp = NULL;
	dbhp = NULL;
	/*
	 *	read in all the directory blocks
	 */
	blk = s->s_lbp->lb_dbfirstblk;
	db32p = (mddb_db32_t *)kmem_zalloc(MDDB_BSIZE, KM_SLEEP);

	for (; blk != 0; blk = dbp->db_nextblk) {
		/* append a fresh (zeroed) incore directory block */
		dbp1 = (mddb_db_t *)kmem_zalloc(sizeof (mddb_db_t), KM_SLEEP);
		if (! dbhp) {
			dbhp = dbp1;
		} else {
			dbp->db_next = dbp1;
		}
		dbp = dbp1;

		err = readblks(s, (caddr_t)db32p, blk, 1, li);
		if (err) {
			err |= MDDB_F_EDATA;
			break;
		}
		db32todb(db32p, dbp);
		if ((db32p->db32_magic != MDDB_MAGIC_DB) ||
		    revchk(MDDB_REV_DB, db32p->db32_revision) ||
		    crcchk(db32p, &db32p->db32_checksum, MDDB_BSIZE, NULL)) {
			/*
			 * No incore entries have been built for this block
			 * yet, so whatever the conversion left in
			 * db_firstentry is not a pointer the cleanup code
			 * below may follow or free.
			 */
			dbp->db_firstentry = NULL;
			err = MDDB_F_EDATA | MDDB_F_EFMT;
			break;
		}
		/*
		 * first go through and fix up all de_next pointers
		 */
		if (dbp->db_firstentry) {

			/* packed entries start right after db32_firstentry */
			de32p = (mddb_de32_t *)
			    ((void *) ((caddr_t)(&db32p->db32_firstentry)
			    + sizeof (db32p->db32_firstentry)));

			dep = (mddb_de_ic_t *)
			    kmem_zalloc(sizeof (mddb_de_ic_t) -
			    sizeof (mddb_block_t) +
			    sizeof (mddb_block_t) * de32p->de32_blkcount,
			    KM_SLEEP);
			de32tode(de32p, dep);

			dbp->db_firstentry = dep;
			while (de32p && de32p->de32_next) {

				de32p2 = nextentry(de32p);

				dep2 = (mddb_de_ic_t *)kmem_zalloc(
				    sizeof (mddb_de_ic_t) -
				    sizeof (mddb_block_t) +
				    sizeof (mddb_block_t) *
				    de32p2->de32_blkcount, KM_SLEEP);

				de32tode(de32p2, dep2);

				dep->de_next = dep2;
				dep = dep2;
				de32p = de32p2;
			}
		}
		/*
		 * go through and make all of the pointer to record blocks
		 * are null;
		 */
		for (dep = dbp->db_firstentry; dep != NULL; dep = dep->de_next)
			dep->de_rb = NULL;
	}
	kmem_free((caddr_t)db32p, MDDB_BSIZE);
	/*
	 * Terminate the incore list.  dbp is still NULL if the locator
	 * block had no first directory block, in which case there is
	 * nothing to terminate.
	 */
	if (dbp != NULL)
		dbp->db_next = NULL;
	/*
	 *	if error occurred in directory blocks free them
	 *	and return
	 */
	if (err) {
		dbp = dbhp;
		while (dbp) {
			dep = dbp->db_firstentry;
			while (dep) {
				/* No mddb_rb32_t structures yet */
				dep2 = dep->de_next;
				kmem_free((caddr_t)dep, sizeofde(dep));
				dep = dep2;
			}
			dbp1 = dbp->db_next;
			kmem_free((caddr_t)dbp, sizeof (mddb_db_t));
			dbp = dbp1;
		}
		s->s_dbp = NULL;
		return (err);

	}
	/*
	 *	read in the non-optimized records and accumulate the
	 *	global checksum, which must XOR out to zero
	 */
	err = 0;
	checksum = MDDB_GLOBAL_XOR;
	for (dbp = dbhp; dbp != NULL; dbp = dbp->db_next) {
		checksum ^= dbp->db_recsum;
		for (dep = dbp->db_firstentry; dep; dep = dep->de_next) {
			if (dep->de_flags & MDDB_F_OPT)
				continue;
			err = getrecord(s, dep, li);
			if (err)
				break;
			/* Don't include CHANGELOG in big XOR */
			if (dep->de_flags & MDDB_F_CHANGELOG)
				continue;
			checksum ^= dep->de_rb->rb_checksum;
			checksum ^= dep->de_rb->rb_checksum_fiddle;
		}
		if (err)
			break;
	}
	if (checksum) {
		if (! err)
			err = MDDB_F_EDATA | MDDB_F_EFMT;
	}
	if (err) {
		/* free the records, the entries and the directory blocks */
		dbp = dbhp;
		dbhp = NULL;
		while (dbp) {
			dep = dbp->db_firstentry;
			while (dep) {
				if (dep->de_rb)
					kmem_free((caddr_t)dep->de_rb,
					    dep->de_recsize);
				dep2 = dep->de_next;
				kmem_free((caddr_t)dep, sizeofde(dep));
				dep = dep2;
			}
			dbp1 = dbp->db_next;
			kmem_free((caddr_t)dbp, sizeof (mddb_db_t));
			dbp = dbp1;
		}
	}
	s->s_dbp = dbhp;
	return (err);
}

/*
 * getoptcnt - count the optimized (MDDB_F_OPT) directory entries in the
 * set that have an active optinfo slot referring to locator index li.
 *
 * An entry is counted once even if both of its slots match.
 */
static int
getoptcnt(
	mddb_set_t	*s,
	int		li)
{
	mddb_db_t	*dbp;
	mddb_de_ic_t	*dep;
	int		count = 0;
	int		slot;

#if defined(_ILP32) && !defined(lint)
	ASSERT(sizeof (mddb_de_t) == sizeof (mddb_de32_t));
	ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t));
#endif

	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
		for (dep = dbp->db_firstentry; dep != NULL;
		    dep = dep->de_next) {
			if (!(dep->de_flags & MDDB_F_OPT))
				continue;
			for (slot = 0; slot < 2; slot++) {
				mddb_optinfo_t	*op = &dep->de_optinfo[slot];

				if ((op->o_flags & MDDB_F_ACTIVE) &&
				    (li == op->o_li)) {
					count++;
					break;
				}
			}
		}
	}
	return (count);
}

/*
 * getoptdev - choose a replica for slot opti of the optimized record
 * described by rdep.
 *
 * The slot's flags are cleared first.  Among the active, non-removable
 * replicas that do not clash with the record's other slot, the one
 * currently holding the fewest optimized records (per getoptcnt()) is
 * selected: its locator index is stored in o_li and o_flags is set to
 * MDDB_F_ACTIVE.  If no replica qualifies, o_flags is left at 0.
 */
static void
getoptdev(
	mddb_set_t	*s,
	mddb_de_ic_t	*rdep,
	int		opti
)
{
	mddb_lb_t	*lbp;
	mddb_locator_t	*lp;
	mddb_optinfo_t	*otherop;
	mddb_optinfo_t	*resultop;
	int		li;
	dev_t		otherdev;
	int		blkonly = 0;
	int		mincnt;
	int		thiscnt;

	lbp = s->s_lbp;

	/* resultop is the slot being filled in, otherop is its partner */
	resultop = &rdep->de_optinfo[opti];
	otherop = &rdep->de_optinfo[1-opti];

	resultop->o_flags = 0;

	/*
	 * scan through and see if data bases have to vary by only device
	 */

	/*
	 * blkonly ends up 1 only when every active replica is on the same
	 * device as the other slot's replica; in that case the new replica
	 * merely has to be a different locator, not a different device.
	 * otherdev is only assigned, and only read, when the other slot is
	 * active.
	 */
	if (otherop->o_flags & MDDB_F_ACTIVE) {
		blkonly = 1;
		otherdev = expldev(lbp->lb_locators[otherop->o_li].l_dev);
		for (li = 0; li < lbp->lb_loccnt; li++) {
			lp = &lbp->lb_locators[li];
			if (! (lp->l_flags & MDDB_F_ACTIVE))
				continue;
			if (expldev(lp->l_dev) != otherdev) {
				blkonly = 0;
				break;
			}
		}
	}

	/* large starting value so that the first candidate is taken */
	mincnt = 999999;
	for (li = 0; li < lbp->lb_loccnt; li++) {
		dev_info_t	*devi;
		int		removable = 0;

		lp = &lbp->lb_locators[li];
		if (! (lp->l_flags & MDDB_F_ACTIVE))
			continue;
		/* stay away from the replica/device the other slot uses */
		if (otherop->o_flags & MDDB_F_ACTIVE) {
			if (blkonly) {
				if (otherop->o_li == li)
					continue;
			} else {
				if (otherdev == expldev(lp->l_dev))
					continue;
			}
		}

		/*
		 * Check if this is a removable device.  If it is we
		 * assume it is something like a USB flash disk, a zip disk
		 * or even a floppy that is being used to help maintain
		 * mddb quorum.  We don't want to put any optimized resync
		 * records on these kinds of disks since they are usually
		 * slower or don't have the same read/write lifetimes as
		 * a regular fixed disk.
		 */
		/*
		 * NOTE(review): l_dev is passed to e_ddi_hold_devi_by_dev()
		 * and getmajor() as-is here, while the comparisons above
		 * run it through expldev() first -- confirm this is right.
		 * The devopsp[] entry is also dereferenced without a NULL
		 * check.
		 */
		if ((devi = e_ddi_hold_devi_by_dev(lp->l_dev, 0)) != NULL) {
			int		error;
			struct cb_ops	*cb;
			ddi_prop_op_t	prop_op = PROP_LEN_AND_VAL_BUF;
			int		propvalue = 0;
			int		proplength = sizeof (int);

			if ((cb = devopsp[getmajor(lp->l_dev)]->devo_cb_ops)
			    != NULL) {
				error = (*cb->cb_prop_op)(DDI_DEV_T_ANY, devi,
				    prop_op, DDI_PROP_NOTPROM |
				    DDI_PROP_DONTPASS, "removable-media",
				    (caddr_t)&propvalue, &proplength);

				/*
				 * the property merely has to exist; its
				 * value is not looked at
				 */
				if (error == DDI_PROP_SUCCESS)
					removable = 1;
			}

			ddi_release_devi(devi);
		}

		if (removable)
			continue;

		/* keep the candidate carrying the fewest optimized records */
		thiscnt = getoptcnt(s, li);
		if (thiscnt < mincnt) {
			resultop->o_li  = li;
			mincnt = thiscnt;
			resultop->o_flags = MDDB_F_ACTIVE;
		}
	}
}

/*
 * allocuserdata - allocate the separate user data area for a directory
 * entry (de_reqsize bytes) and fill it with a copy of the data held in
 * the entry's record block.
 *
 * The record block's rb_private is cleared and rb_userdata is set to a
 * non-zero marker value.
 */
static void
allocuserdata(
	mddb_de_ic_t	*dep
)
{
	mddb_rb32_t	*rec = dep->de_rb;

#if defined(_ILP32) && !defined(lint)
	ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t));
#endif

	dep->de_rb_userdata = kmem_zalloc(dep->de_reqsize, KM_SLEEP);

	rec->rb_private = 0;
	rec->rb_userdata = 0x4;	/* only has to be non-zero */

	bcopy((caddr_t)rec->rb_data, dep->de_rb_userdata, dep->de_reqsize);
}


/*
 * getuserdata - move a directory entry's user data (de_rb_userdata)
 * into the data area of its record block (rb_data).
 *
 * Normally this is a straight copy of de_reqsize bytes.  For a driver
 * record (type >= MDDB_FIRST_MODID) with an old style revision
 * (MDDB_REV_RB or MDDB_REV_RBFN) in a set that is not being imported,
 * the incore 64 bit structure is converted to its small on disk layout
 * by the converter matching de_flags; entries with no matching
 * converter are copied as-is.
 */
static void
getuserdata(
	set_t		setno,
	mddb_de_ic_t	*dep
)
{
	mddb_type_t	type = dep->de_type1;
	mddb_rb32_t	*rec;
	caddr_t		data, udata;

#if defined(_ILP32) && !defined(lint)
	ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t));
#endif
	rec = dep->de_rb;
	data = (caddr_t)rec->rb_data;
	udata = (caddr_t)dep->de_rb_userdata;

	/* anything that needs no conversion is copied verbatim */
	if ((md_get_setstatus(setno) &
	    (MD_SET_IMPORT | MD_SET_REPLICATED_IMPORT)) ||
	    (type < MDDB_FIRST_MODID) ||
	    ((rec->rb_revision != MDDB_REV_RB) &&
	    (rec->rb_revision != MDDB_REV_RBFN))) {
		bcopy(udata, data, dep->de_reqsize);
		return;
	}

	switch (dep->de_flags) {
	case MDDB_F_STRIPE:
		stripe_convert(data, udata, BIG_2_SMALL);
		break;
	case MDDB_F_MIRROR:
		mirror_convert(data, udata, BIG_2_SMALL);
		break;
	case MDDB_F_RAID:
		raid_convert(data, udata, BIG_2_SMALL);
		break;
	case MDDB_F_SOFTPART:
		softpart_convert(data, udata, BIG_2_SMALL);
		break;
	case MDDB_F_TRANS_MASTER:
		trans_master_convert(data, udata, BIG_2_SMALL);
		break;
	case MDDB_F_TRANS_LOG:
		trans_log_convert(data, udata, BIG_2_SMALL);
		break;
	case MDDB_F_HOTSPARE:
		hs_convert(data, udata, BIG_2_SMALL);
		break;
	case MDDB_F_OPT:
	default:
		bcopy(udata, data, dep->de_reqsize);
		break;
	}
}

/*
 * getoptrecord - read in an optimized record.
 *
 * An optimized record can live on up to two replicas (de_optinfo[0] and
 * de_optinfo[1]).  The first copy that reads and validates cleanly is
 * kept in dep->de_rb; a second good copy is read into a scratch buffer
 * only so that its checksum can be compared with the first.
 *
 * Both optinfo slots start out flagged MDDB_F_EDATA; a slot whose copy
 * validates has its flags reset to just MDDB_F_ACTIVE.  If both copies
 * are good but their checksums differ, slot 1 is flagged MDDB_F_EDATA
 * again.
 *
 * If no good copy is found, dep->de_rb is turned into an empty record
 * (zeroed, with magic, revision, timestamp and crc filled in).
 */
static void
getoptrecord(
	mddb_set_t	*s,
	mddb_de_ic_t	*dep
)
{
	mddb_lb_t	*lbp;
	mddb_locator_t	*lp;
	mddb_rb32_t	*rbp, *crbp;
	int		li;
	int		i;
	int		err = 0;
	size_t		recsize;

#if defined(_ILP32) && !defined(lint)
	ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t));
#endif

	lbp = s->s_lbp;

	recsize = dep->de_recsize;
	dep->de_rb = (mddb_rb32_t *)kmem_zalloc(recsize, KM_SLEEP);
	/* rbp is the current read target: de_rb first, then the scratch */
	rbp = dep->de_rb;
	crbp = (mddb_rb32_t *)kmem_zalloc(recsize, KM_SLEEP);

	/* assume both copies are bad until proven otherwise */
	dep->de_optinfo[0].o_flags |= MDDB_F_EDATA;
	dep->de_optinfo[1].o_flags |= MDDB_F_EDATA;

	for (i = 0; i < 2; i++) {
		if (! (dep->de_optinfo[i].o_flags & MDDB_F_ACTIVE))
			continue;
		li = dep->de_optinfo[i].o_li;
		lp = &lbp->lb_locators[li];

		if (! (lp->l_flags & MDDB_F_ACTIVE) ||
		    (lp->l_flags & MDDB_F_EMASTER))
			continue;

		err = readblklst(s, (caddr_t)rbp, dep->de_blks,
		    dep->de_blkcount, li, 0);

		if (err)
			continue;

		if (rbp->rb_magic != MDDB_MAGIC_RB)
			continue;

		if (revchk(MDDB_REV_RB, rbp->rb_revision))
			continue;

		/* Check the crc for this record */
		if (rec_crcchk(s, dep, rbp)) {
			continue;
		}

		/* this copy is good */
		dep->de_optinfo[i].o_flags = MDDB_F_ACTIVE;

		if (rbp == crbp) {
			/*
			 * This was the second good copy (in the scratch
			 * buffer); the first one is in dep->de_rb.  The two
			 * must agree.  (This used to compare the scratch
			 * buffer's checksum with itself, so a mismatch was
			 * never noticed.)
			 */
			if (dep->de_rb->rb_checksum != crbp->rb_checksum)
				dep->de_optinfo[1].o_flags |= MDDB_F_EDATA;
			break;
		}
		/* first good copy is in de_rb; read any other into scratch */
		rbp = crbp;
	}

	if (rbp == crbp) {
		/*
		 * At least one good copy was found and is in dep->de_rb.
		 * NOTE(review): rbp is the scratch buffer at this point, so
		 * this store lands in memory that is freed on the next
		 * line; it looks like dep->de_rb->rb_private was meant --
		 * confirm before changing, since de_rb may be written back
		 * to disk without its crc being regenerated.
		 */
		rbp->rb_private = 0;
		kmem_free((caddr_t)crbp, recsize);
		return;
	}
	/* no good copy: hand back an empty, valid record in de_rb */
	bzero((caddr_t)rbp, recsize);
	rbp->rb_magic = MDDB_MAGIC_RB;
	rbp->rb_revision = MDDB_REV_RB;
	uniqtime32(&rbp->rb_timestamp);
	/* Generate the crc for this record */
	rec_crcgen(s, dep, rbp);
	kmem_free((caddr_t)crbp, recsize);
}

/*
 * writeoptrecord writes out an optimized record.
 *
 * The record block (dep->de_rb) is written asynchronously to each
 * active optinfo replica while holding the set's optimized-record
 * queuing lock; the lock is released once the writes are queued, and
 * the routine then waits for every queued buf with the set mutex
 * dropped.
 *
 * Returns 0 on success, otherwise the OR of the wrtblklst() errors and
 * MDDB_F_EWRITE for any buf that completed with B_ERROR.  For a
 * traditional/local set the failing locator is flagged MDDB_F_EWRITE;
 * for a MN diskset only the matching de_optinfo[].o_flags is flagged.
 */
static int
writeoptrecord(
	mddb_set_t	*s,
	mddb_de_ic_t	*dep
)
{
	mddb_rb32_t	*rbp;
	int		li;
	int		err = 0, wrt_err = 0;
	mddb_bf_t	*bufhead, *bfp;
	mddb_lb_t	*lbp = s->s_lbp;
	mddb_locator_t	*lp;
	int		i;

#if defined(_ILP32) && !defined(lint)
	ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t));
#endif

	bufhead = NULL;
	err = 0;

	/* take the queuing lock, waiting on the set mutex if it is held */
	while (s->s_opthavequeuinglck) {
		s->s_optwantqueuinglck++;
		cv_wait(&s->s_optqueuing_cv, SETMUTEX(s->s_setno));
	}
	s->s_opthavequeuinglck++;
	rbp = dep->de_rb;
	for (i = 0; i < 2; i++) {
		/*
		 * only possible error is xlate. This can
		 * occur if a replica was off line and came
		 * back. During the mean time the database grew
		 * large than the now on line replica can store
		 */
		if (! (dep->de_optinfo[i].o_flags & MDDB_F_ACTIVE))
			continue;
		li = dep->de_optinfo[i].o_li;
		/*
		 * In a MN diskset, any node can write optimized record(s).
		 */
		wrt_err = wrtblklst(s, (caddr_t)rbp, dep->de_blks,
		    dep->de_blkcount, li, &bufhead, MDDB_WR_ANY_NODE);
		/*
		 * For MN diskset, set error in optinfo structure so
		 * that mddb_commitrec knows which replica failed.
		 */
		if ((MD_MNSET_SETNO(s->s_setno)) &&
		    (wrt_err & MDDB_F_EWRITE)) {
			dep->de_optinfo[i].o_flags |= MDDB_F_EWRITE;
		}
		err |= wrt_err;
	}
	/* all writes are queued: release the queuing lock, wake waiters */
	s->s_opthavequeuinglck = 0;
	if (s->s_optwantqueuinglck) {
		s->s_optwantqueuinglck = 0;
		cv_broadcast(&s->s_optqueuing_cv);
	}
	/* reap the queued bufs one at a time, always from the chain head */
	for (bfp = bufhead; bfp; bfp = bufhead) {
		/* the set mutex is dropped while waiting for the I/O */
		mutex_exit(SETMUTEX(s->s_setno));
		(void) biowait(&bfp->bf_buf);
		mutex_enter(SETMUTEX(s->s_setno));
		if (bfp->bf_buf.b_flags & B_ERROR) {
			/*
			 * If an MN diskset, don't set replica
			 * in error since this hasn't been set in master.
			 * Setting replica in error before master could
			 * leave the nodes with different views of the
			 * world since a class 1 configuration change
			 * could occur in mddb_commitrec as soon as
			 * all locks are dropped.  Must keep this
			 * node the same as master and can't afford a
			 * failure from the class 1 config change
			 * if master succeeded.
			 */
			if (!(MD_MNSET_SETNO(s->s_setno))) {
				bfp->bf_locator->l_flags |= MDDB_F_EWRITE;
			} else {
				/*
				 * Find which de_optinfo (which replica)
				 * had a failure and set the failure in
				 * the o_flags field.
				 */
				lp = &lbp->lb_locators[dep->de_optinfo[0].o_li];
				if (lp == bfp->bf_locator) {
					dep->de_optinfo[0].o_flags |=
					    MDDB_F_EWRITE;
				} else {
					dep->de_optinfo[1].o_flags |=
					    MDDB_F_EWRITE;
				}
			}
			err |= MDDB_F_EWRITE;
		}
		/* unlink before freeing so the loop picks up the next buf */
		bufhead = bfp->bf_next;
		freebuffer(s, bfp);
	}
	return (err);
}

/*
 * Fix up the optimized resync record.  Used in the traditional and local
 * disksets to move an optimized record from a failed or deleted mddb
 * to an active one.
 *
 * In a MN diskset, the fixing of the optimized record is split between
 * the master and slave nodes.  If the master node moves the optimized
 * resync record, then the master node will send a MDDB_PARSE_OPTRECS
 * message to the slave nodes causing the slave nodes to reget the
 * directory entry containing the location of the optimized resync record.
 * After the record is reread from disk, then writeoptrecord is called
 * if the location of the optimized resync record or flags have changed.
 * When writeoptrecord is called, the node that is the owner of this record
 * will write the optimized record to the location specified in the directory
 * entry.  Since the master node uses the highest class message (PARSE)
 * the record owner node is guaranteed to already have an updated
 * directory entry incore.
 *
 * The other difference between the traditional/local set and MN diskset
 * is that the directory entry can be written to disk before the optimized
 * record in a MN diskset if the record is owned by a slave node.  So,
 * the users of an optimized record must handle the failure case when no
 * data is available from an optimized record since the master node could
 * have failed during the relocation of the optimized record to another mddb.
 *
 * Arguments:
 *	s   - incore set structure
 *	dep - directory entry of the optimized record
 *	dbp - incore directory block containing dep; rewritten to all
 *	      replicas if a replica assignment changed
 *
 * Returns 0 if nothing had to change or everything was written; otherwise
 * the error from writeoptrecord() or writeall().
 */
static int
fixoptrecord(
	mddb_set_t	*s,
	mddb_de_ic_t	*dep,
	mddb_db_t	*dbp
)
{
	int		changed;
	int		writedata;
	int		err = 0;
	int		i;
	mddb_lb_t	*lbp;
	mddb_optinfo_t	*op;
	mddb_db32_t	*db32p;
	int		rec_owner;	/* Is node owner of record? */

#if defined(_ILP32) && !defined(lint)
	ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t));
#endif

	lbp = s->s_lbp;
	/*
	 * changed: a replica assignment changed, so the directory block
	 * must be rewritten.  writedata: the record itself must be
	 * (re)written.
	 */
	changed = 0;
	writedata = 0;
	for (i = 0; i < 2; i++) {
		op = &dep->de_optinfo[i];

		/* a slot pointing at an inactive replica is no longer valid */
		if (! (lbp->lb_locators[op->o_li].l_flags & MDDB_F_ACTIVE))
			op->o_flags = 0;

		/*
		 * If optimized record has seen a replica failure,
		 * assign new replica to record and re-write data
		 * to new record.
		 */
		if (! (op->o_flags & MDDB_F_ACTIVE)) {
			getoptdev(s, dep, i);
			writedata++;
			changed++;
			/* Set flag for slaves to reread dep and write rec */
			if (lbp->lb_flags & MDDB_MNSET) {
				s->s_mn_parseflags |= MDDB_PARSE_OPTRECS;
			}
		}

		/*
		 * If just an error in the data was seen, set
		 * the optimized record's replica flag to active (ok)
		 * and try again.
		 */
		/*
		 * NOTE(review): slot 0's flags are reset here even when the
		 * data error was seen on slot 1 (i == 1), whose own
		 * MDDB_F_EDATA bit is left set -- confirm that op->o_flags
		 * was not intended.
		 */
		if (op->o_flags & MDDB_F_EDATA) {
			dep->de_optinfo[0].o_flags = MDDB_F_ACTIVE;
			writedata++;
		}
	}

	rec_owner = 0;
	if (lbp->lb_flags & MDDB_MNSET) {
		/*
		 * If a MN diskset then check the owner of optimized record.
		 * If the master node owns the record or if there is
		 * no owner of the record, then the master can write the
		 * optimized record to disk.
		 * Master node can write the optimized record now, but
		 * slave nodes write their records during handling of
		 * the MDDB_PARSE_OPTRECS message.
		 */
		if ((dep->de_owner_nodeid == MD_MN_INVALID_NID) ||
		    (dep->de_owner_nodeid == md_set[s->s_setno].s_nodeid)) {
			rec_owner = 1;
		}
	} else {
		/*
		 * In traditional diskset and local set, this node
		 * is always the record owner and always the master.
		 */
		rec_owner = 1;
	}

	/*
	 * If this node is the record owner, write out record.
	 */
	if ((writedata) && (rec_owner)) {
		if (err = writeoptrecord(s, dep)) {
			return (err);
		}
	}
	if (! changed)
		return (0);
	/*
	 * A replica assignment changed: stamp the directory block, build
	 * its 32-bit form in a temporary MDDB_BSIZE buffer, checksum it and
	 * write it to all replicas.
	 */
	uniqtime32(&dbp->db_timestamp);
	dbp->db_revision = MDDB_REV_DB;
	db32p = (mddb_db32_t *)kmem_zalloc(MDDB_BSIZE, KM_SLEEP);
	create_db32rec(db32p, dbp);
	crcgen(db32p, &db32p->db32_checksum, MDDB_BSIZE, NULL);
	err = writeall(s, (caddr_t)db32p, db32p->db32_blknum,
	    1, MDDB_WR_ONLY_MASTER);
	kmem_free((caddr_t)db32p, MDDB_BSIZE);
	return (err);
}

/*
 * Walk every directory block of the set and run fixoptrecord() on each
 * directory entry that is flagged as an optimized record (MDDB_F_OPT).
 *
 * Returns 0 on success, or the first non-zero value returned by
 * fixoptrecord(); the walk stops at that point.
 */
static int
fixoptrecords(
	mddb_set_t		*s
)
{
	set_t		setno = s->s_setno;
	mddb_db_t	*dirblk;
	mddb_de_ic_t	*entry;
	int		rval;

	/*
	 * In a MN diskset only the master node repairs optimized records.
	 * When the master changes something it sends a PARSE message to
	 * the slave nodes, which then re-read the locator block or the
	 * directory blocks and re-write the optimized resync records.
	 * A non-master node therefore has nothing to do here.
	 */
	if (setno != MD_LOCAL_SET && (s->s_lbp->lb_flags & MDDB_MNSET) &&
	    !md_set[setno].s_am_i_master)
		return (0);

	dirblk = s->s_dbp;
	while (dirblk != NULL) {
		entry = dirblk->db_firstentry;
		while (entry != NULL) {
			if (entry->de_flags & MDDB_F_OPT) {
				rval = fixoptrecord(s, entry, dirblk);
				if (rval != 0)
					return (rval);
			}
			entry = entry->de_next;
		}
		dirblk = dirblk->db_next;
	}
	return (0);
}

/*
 * Compare the incore version of the mddb against the copy stored on the
 * replica selected by locator index li.
 *
 * The directory blocks are verified first (magic, revision, crc, links
 * and every directory entry), then each record that is neither an
 * optimized record nor a change log record is read back and its
 * checksums are compared with the incore record.
 *
 * Returns:
 *	- 0 if the data was successfully read and is good.
 *	- MDDB_F_EREAD if a read error occurred.
 *	- 1 if the data read is bad (checksum failed, etc)
 */
static int
checkcopy
(
	mddb_set_t	*s,
	int		li
)
{
	mddb_db_t	*dbp;
	mddb_db32_t	*disk_dbp;
	mddb_de_ic_t	*dep;
	mddb_de32_t	*disk_dep;
	mddb_rb32_t	*core_rbp, *disk_rbp;
	size_t		nbytes;
	int		oi;

#if defined(_ILP32) && !defined(lint)
	ASSERT(sizeof (mddb_de_t) == sizeof (mddb_de32_t));
	ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t));
	ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t));
#endif

	/*
	 * Allocate the scratch buffer on first use.  It is sized for the
	 * largest non-optimized record, and never smaller than one block.
	 */
	if (s->s_databuffer_size == 0) {
		size_t	bufsize = MDDB_BSIZE;

		for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
			for (dep = dbp->db_firstentry; dep != NULL;
			    dep = dep->de_next) {
				if (dep->de_flags & MDDB_F_OPT)
					continue;
				if (dep->de_recsize > bufsize)
					bufsize = dep->de_recsize;
			}
		}

		s->s_databuffer = (caddr_t)kmem_zalloc(bufsize, KM_SLEEP);
		s->s_databuffer_size = bufsize;
	}

	disk_dbp = (mddb_db32_t *)s->s_databuffer;

	/*
	 * Pass 1: every ondisk directory block must match its incore
	 * counterpart.
	 */
	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
		if (readblks(s, (caddr_t)disk_dbp, dbp->db_blknum, 1, li))
			return (MDDB_F_EREAD);

		if (disk_dbp->db32_magic != MDDB_MAGIC_DB ||
		    revchk(MDDB_REV_DB, disk_dbp->db32_revision) ||
		    crcchk(disk_dbp, &disk_dbp->db32_checksum,
		    MDDB_BSIZE, NULL) ||
		    disk_dbp->db32_nextblk != dbp->db_nextblk ||
		    disk_dbp->db32_recsum != dbp->db_recsum)
			return (1);

		/*
		 * The first ondisk directory entry, if any, sits
		 * immediately after the db32_firstentry field.
		 */
		disk_dep = NULL;
		if (disk_dbp->db32_firstentry) {
			disk_dep = (mddb_de32_t *)
			    ((void *)((caddr_t)(&disk_dbp->db32_firstentry)
			    + sizeof (disk_dbp->db32_firstentry)));
		}

		/* Walk both entry lists in step; they must be identical. */
		dep = dbp->db_firstentry;
		while (dep != NULL && disk_dep != NULL) {
			if (dep->de_recid != disk_dep->de32_recid ||
			    dep->de_type1 != disk_dep->de32_type1 ||
			    dep->de_type2 != disk_dep->de32_type2 ||
			    dep->de_reqsize != disk_dep->de32_reqsize ||
			    dep->de_flags != disk_dep->de32_flags)
				return (1);

			for (oi = 0; oi < 2; oi++) {
				if (dep->de_optinfo[oi].o_li !=
				    disk_dep->de32_optinfo[oi].o_li)
					return (1);
			}

			nbytes = sizeof (mddb_block_t) * dep->de_blkcount;
			if (bcmp((caddr_t)dep->de_blks,
			    (caddr_t)disk_dep->de32_blks, nbytes))
				return (1);

			dep = dep->de_next;
			if (disk_dep->de32_next)
				disk_dep = nextentry(disk_dep);
			else
				disk_dep = NULL;
		}

		/* One list ended before the other. */
		if (dep != NULL || disk_dep != NULL)
			return (1);
	}

	/*
	 * Pass 2: the directories are functionally identical, so now
	 * verify the records.  The records are not simply bcmp'ed because
	 * the lock flag must not take part in the comparison; the crc and
	 * the checksum fields are compared instead.
	 */
	disk_rbp = (mddb_rb32_t *)disk_dbp;
	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
		for (dep = dbp->db_firstentry; dep != NULL;
		    dep = dep->de_next) {
			if (dep->de_flags & (MDDB_F_OPT | MDDB_F_CHANGELOG))
				continue;

			core_rbp = (mddb_rb32_t *)dep->de_rb;
			if (readblklst(s, (caddr_t)disk_rbp, dep->de_blks,
			    dep->de_blkcount, li, 0))
				return (MDDB_F_EREAD);

			/* Check the crc for this record */
			if (rec_crcchk(s, dep, disk_rbp))
				return (1);

			if (core_rbp->rb_checksum != disk_rbp->rb_checksum ||
			    core_rbp->rb_checksum_fiddle !=
			    disk_rbp->rb_checksum_fiddle)
				return (1);
		}
	}
	return (0);
}

/*
 * Decide whether the replica described by rip lives at the location given
 * by (devid, minor, dev, blkno).
 *
 * When rip has a devid and both a devid and a minor name were supplied,
 * the comparison is made on devid + minor name; otherwise the dev_ts are
 * compared.  Matching on devid handles the case where a mddb was
 * relocated to a dead mddb's dev_t: the live mddb carries the dead mddb's
 * dev_t, but the devid comparison catches this and does not match.
 * The block offset must match in either case.
 *
 * Return 1 if the location of the two mddbs match, 0 if not.
 */
static int
match_mddb(mddb_ri_t *rip, ddi_devid_t devid, char *minor, md_dev64_t dev,
	daddr32_t blkno)
{
	/*
	 * Never match on an errored element: doing so could erroneously
	 * match on the dev_t of a relocated disk.
	 */
	if (rip->ri_flags & MDDB_F_EMASTER)
		return (0);

	if (rip->ri_devid && devid && minor) {
		int	devid_differs;

		devid_differs = (ddi_devid_compare(rip->ri_devid, devid) != 0);

		/*
		 * If an old devid exists, then this is a replicated
		 * diskset and a match on either the new or the old devid
		 * is acceptable.
		 */
		if (devid_differs && rip->ri_old_devid) {
			devid_differs =
			    (ddi_devid_compare(rip->ri_old_devid, devid) != 0);
		}

		if (devid_differs || strcmp(rip->ri_minor_name, minor) != 0)
			return (0);
	} else if (rip->ri_dev != dev) {
		return (0);
	}

	return (rip->ri_blkno == blkno);
}

/*
 * Enter the replica described by clp into the replica information list
 * headed by *rip.
 *
 * The dev_t for the replica is built from clp's driver name and minor
 * number.  If clp carries a device id, the id is validated; when the
 * validated device differs from the one named in clp (and this is not an
 * upgrade), clp's driver name and minor number are updated to the new
 * device and the devid is flagged MDDB_DEVID_VALID.  A devid that fails
 * validation has MDDB_DEVID_VALID cleared.
 *
 * clp->l_dev is always updated, and also stored through dev_2b_fixed when
 * that pointer is non-NULL.
 *
 * If a matching entry already exists its flags are updated; otherwise a
 * new entry is allocated and appended to the end of the list.
 *
 * Returns 0 on success, EINVAL if the driver name cannot be mapped to a
 * major number (non-upgrade case only).
 */
static int
ridev(
	mddb_ri_t	**rip,
	mddb_cfg_loc_t	*clp,
	dev32_t		*dev_2b_fixed,
	int		flag)
{
	mddb_ri_t	*cur, *newrip;
	mddb_ri_t	**linkp;
	md_dev64_t	ldev, ndev;
	major_t		majordev;
	int		sz;

	if (MD_UPGRADE) {
		ldev = md_makedevice(md_targ_name_to_major(clp->l_driver),
		    clp->l_mnum);
	} else {
		majordev = ddi_name_to_major(clp->l_driver);
		if (majordev == (major_t)-1)
			return (EINVAL);

		ldev = md_makedevice(majordev, clp->l_mnum);
	}

	if (clp->l_devid != 0) {
		/*
		 * Get dev associated with device id and minor name.
		 * Setup correct driver name if dev is now different.
		 * Don't change driver name if during upgrade.
		 */
		ndev = ldev;
		if (mddb_devid_validate((ddi_devid_t)(uintptr_t)clp->l_devid,
		    &ndev, clp->l_minor_name) != 0) {
			/* Mark as invalid */
			clp->l_devid_flags &= ~MDDB_DEVID_VALID;
		} else if ((ndev != ldev) && (!(MD_UPGRADE))) {
			majordev = md_getmajor(ndev);
			(void) strcpy(clp->l_driver,
			    ddi_major_to_name(majordev));
			clp->l_mnum = md_getminor(ndev);
			clp->l_devid_flags |= MDDB_DEVID_VALID;
			ldev = ndev;
		}
	}

	clp->l_dev = md_cmpldev(ldev);
	if (dev_2b_fixed)
		*dev_2b_fixed = clp->l_dev;

	/* Look for an existing entry describing this replica. */
	for (cur = *rip; cur != NULL; cur = cur->ri_next) {
		if (!match_mddb(cur, (ddi_devid_t)(uintptr_t)clp->l_devid,
		    clp->l_minor_name, ldev, clp->l_blkno))
			continue;

		if ((clp->l_devid != 0) &&
		    !(clp->l_devid_flags & MDDB_DEVID_VALID)) {
			cur->ri_flags |= MDDB_F_EMASTER;
		} else {
			cur->ri_flags |= flag;
		}
		return (0);	/* already entered return success */
	}

	/*
	 * This replica not represented in the current rip list,
	 * so build a new entry for it.
	 */
	newrip = (mddb_ri_t *)kmem_zalloc(sizeof (*newrip), KM_SLEEP);
	newrip->ri_dev = ldev;
	newrip->ri_blkno = clp->l_blkno;

	/* strncpy does not terminate a truncated name, so do it here. */
	(void) strncpy(newrip->ri_driver, clp->l_driver, MD_MAXDRVNM);
	if (strlen(clp->l_driver) >= MD_MAXDRVNM) {
		newrip->ri_driver[(MD_MAXDRVNM -1)] = '\0';
	}
	if (clp->l_devname != NULL) {
		(void) strcpy(newrip->ri_devname, clp->l_devname);
	}
	newrip->ri_flags |= flag;

	if (clp->l_devid == 0) {
		newrip->ri_devid = 0;
		newrip->ri_old_devid = 0;
	} else {
		/* Keep private copies of the device id(s). */
		sz = clp->l_devid_sz;
		newrip->ri_devid = (ddi_devid_t)kmem_zalloc(sz, KM_SLEEP);
		bcopy((void *)(uintptr_t)clp->l_devid,
		    (char *)newrip->ri_devid, sz);

		if (clp->l_old_devid != NULL) {
			sz = clp->l_old_devid_sz;
			newrip->ri_old_devid = (ddi_devid_t)kmem_zalloc(sz,
			    KM_SLEEP);
			bcopy((char *)(uintptr_t)clp->l_old_devid,
			    (char *)newrip->ri_old_devid, sz);
		} else {
			newrip->ri_old_devid = 0;
		}

		/* A minor name that is too long is not copied at all. */
		if (strlen(clp->l_minor_name) < MDDB_MINOR_NAME_MAX)
			(void) strcpy(newrip->ri_minor_name,
			    clp->l_minor_name);

		if (!(clp->l_devid_flags & MDDB_DEVID_VALID)) {
			/*
			 * Devid is present, but not valid.  This could
			 * happen if device has been powered off or if
			 * the device has been removed.  Mark the device in
			 * error.  Don't allow any writes to this device
			 * based on the dev_t since another device could
			 * have been placed in its spot and be responding to
			 * the dev_t accesses.
			 */
			newrip->ri_flags |= MDDB_F_EMASTER;
		}
	}

	/*
	 * Append the entry to the end of the rip list; if the list is
	 * empty the entry becomes the list.
	 */
	for (linkp = rip; *linkp != NULL; linkp = &(*linkp)->ri_next)
		/* void */;
	*linkp = newrip;
	return (0);
}

/*
 * writecopy writes the incore directory blocks and records out to the
 * replica selected by locator index li.
 * This is called from writestart
 *	- when a diskset is started or
 *	- when an error has been encountered during the write to a mddb.
 * and from newdev when a new mddb is being added.
 *
 * flag can be 2 values:
 *	MDDB_WRITECOPY_ALL - write all records.  This is always used for
 *		traditional and local disksets.
 *		For MN diskset:
 *			All nodes can call writecopy, but only the
 *			master node actually writes data to the disk
 *			except for optimized resync records.
 *			An optimized resync record can only be written to
 *			by the record owner.
 *	MDDB_WRITECOPY_SYNC - special case for MN diskset.  When a new
 *		master has been chosen, the new master may need to
 *		write its incore mddb to disk (this is the case where the
 *		old master had executed a message but hadn't relayed it
 *		to this slave yet).  New master should not write the
 *		change log records since new master would be overwriting
 *		valuable data.  Only used during a reconfig cycle.
 *
 * Returns 0 on success, otherwise the error from the failing write.
 */
static int
writecopy(
	mddb_set_t	*s,
	int		li,
	int		flag
)
{
	mddb_db_t	*dbp;
	mddb_db32_t	*dirbuf;
	mddb_de_ic_t	*dep;
	mddb_rb32_t	*rbp;
	uint_t		fiddle;
	int		err;

#if defined(_ILP32) && !defined(lint)
	ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t));
	ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t));
#endif

	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
		/* Build the ondisk image of this directory block. */
		dirbuf = (mddb_db32_t *)kmem_zalloc(MDDB_BSIZE, KM_SLEEP);
		create_db32rec(dirbuf, dbp);
		crcgen(dirbuf, &dirbuf->db32_checksum, MDDB_BSIZE, NULL);
		err = writeblks(s, (caddr_t)dirbuf, dbp->db_blknum, 1, li,
		    MDDB_WR_ONLY_MASTER);
		kmem_free((caddr_t)dirbuf, MDDB_BSIZE);
		if (err != 0)
			return (err);

		for (dep = dbp->db_firstentry; dep != NULL;
		    dep = dep->de_next) {
			/*
			 * Two kinds of record are skipped:
			 *
			 * - Change log records when flag is
			 *   MDDB_WRITECOPY_SYNC: a new master writing its
			 *   incore copy of the mddb must not overwrite
			 *   the change log.
			 *
			 * - Optimized resync records in a multinode
			 *   diskset, since only the mirror owner node
			 *   will have the correct data.  If writecopy is
			 *   being called from writestart as a result of
			 *   an mddb failure, then writestart will handle
			 *   the optimized records when it calls
			 *   fixoptrecords.
			 */
			if ((flag == MDDB_WRITECOPY_SYNC &&
			    (dep->de_flags & MDDB_F_CHANGELOG)) ||
			    ((MD_MNSET_SETNO(s->s_setno)) &&
			    (dep->de_flags & MDDB_F_OPT)))
				continue;

			rbp = dep->de_rb;

			/*
			 * Regenerate the crc for this record and adjust
			 * rb_checksum_fiddle so that the xor of fiddle
			 * and checksum is the same as it was before.
			 */
			fiddle = rbp->rb_checksum_fiddle ^ rbp->rb_checksum;
			rec_crcgen(s, dep, rbp);
			rbp->rb_checksum_fiddle = fiddle ^ rbp->rb_checksum;

			err = wrtblklst(s, (caddr_t)rbp, dep->de_blks,
			    dep->de_blkcount, li, (mddb_bf_t **)0,
			    MDDB_WR_ONLY_MASTER);
			if (err != 0)
				return (err);
		}
	}
	return (0);
}

/*
 * Send the set's current commit count to the mediator hosts and, when
 * the replica/mediator counts call for it, mark the mediator data golden.
 *
 * tag is passed through unchanged to upd_med_hosts().
 *
 * Always returns 0.  Panics if fewer than half of the mediator hosts
 * could be updated while no more than half of the replicas are usable.
 */
static int
upd_med(
	mddb_set_t	*s,
	char		*tag
)
{
	mddb_lb_t	*lbp = s->s_lbp;
	set_t		setno = s->s_setno;
	med_data_t	meddb;
	int		medok;
	int		li;
	int		alc = 0;	/* replicas that can be written */
	int		lc = 0;		/* replicas not deleted */

	/* If no mediator hosts, nothing to do */
	if (s->s_med.n_cnt == 0)
		return (0);

	/*
	 * If this is a MN set and we are not the master, then don't
	 * update mediator hosts or mark mediator as golden since
	 * only master node should do that.
	 */
	if ((setno != MD_LOCAL_SET) && (lbp->lb_flags & MDDB_MNSET) &&
	    !md_set[setno].s_am_i_master)
		return (0);

	/* Build the mediator record describing the current commit count. */
	bzero((char *)&meddb, sizeof (med_data_t));
	meddb.med_dat_mag = MED_DATA_MAGIC;
	meddb.med_dat_rev = MED_DATA_REV;
	meddb.med_dat_fl = 0;
	meddb.med_dat_sn = setno;
	meddb.med_dat_cc = lbp->lb_commitcnt;
	TIMEVAL32_TO_TIMEVAL(&meddb.med_dat_id, &lbp->lb_ident.createtime);
	crcgen(&meddb, &meddb.med_dat_cks, sizeof (med_data_t), NULL);

	/* count accessible mediators */
	medok = upd_med_hosts(&s->s_med, s->s_setname, &meddb, tag);

	/* count accessible and existing replicas */
	for (li = 0; li < lbp->lb_loccnt; li++) {
		mddb_locator_t	*lp = &lbp->lb_locators[li];

		if (lp->l_flags & MDDB_F_DELETED)
			continue;

		lc++;

		if ((lp->l_flags & MDDB_F_ACTIVE) &&
		    !(lp->l_flags & (MDDB_F_EMASTER | MDDB_F_EWRITE)))
			alc++;
	}

	/*
	 * Mediator update quorum is >= 50%: check for less than
	 * "mediator update" quorum.
	 */
	if ((medok * 2) < s->s_med.n_cnt) {
		/* panic if <= 50% of all replicas are accessible */
		if ((lc > 0) && ((alc * 2) <= lc)) {
			cmn_err(CE_PANIC,
			    "md: Update of 50%% of the mediator hosts failed");
			/* NOTREACHED */
		}

		cmn_err(CE_WARN,
		    "md: Update of 50%% of the mediator hosts failed");
	}

	/*
	 * If more than half of the mediator hosts were updated and exactly
	 * 50% of the replicas are accessible then mark the mediator as
	 * golden and push the record out again.
	 */
	if (((medok * 2) >= (s->s_med.n_cnt + 1)) && (lc > 0) &&
	    ((alc * 2) == lc)) {
		meddb.med_dat_fl = MED_DFL_GOLDEN;
		crcgen(&meddb, &meddb.med_dat_cks, sizeof (med_data_t), NULL);
		(void) upd_med_hosts(&s->s_med, s->s_setname, &meddb, tag);
	}

	return (0);
}

/*
 * Stamp the incore locator block with a fresh timestamp and the revision
 * appropriate to the kind of set, then write it out via writelocall().
 *
 * The updates to the mediator hosts are done by the callers of this
 * function.
 *
 * Returns the value returned by writelocall().
 */
static int
push_lb(mddb_set_t *s)
{
	mddb_lb_t	*lbp = s->s_lbp;

	uniqtime32(&lbp->lb_timestamp);

	/* MN disksets use their own locator block revision. */
	lbp->lb_revision = MD_MNSET_SETNO(s->s_setno) ?
	    MDDB_REV_MNLB : MDDB_REV_LB;

	/* push the change to all the replicas */
	return (writelocall(s));
}

/*
 * Compare two data tags field by field: set number, serial number, host
 * name and finally the timestamp.
 *
 * Returns 0 if the tags are equal and non-zero otherwise.
 *
 * Should not call for MN diskset since data tags are not supported.
 */
static int
dtl_cmp(const mddb_dtag_t *odtp, const mddb_dtag_t *ndtp)
{
	int	rv;

	rv = (int)(odtp->dt_setno - ndtp->dt_setno);
	if (rv == 0)
		rv = strncmp(odtp->dt_sn, ndtp->dt_sn, MDDB_SN_LEN);
	if (rv == 0)
		rv = strncmp(odtp->dt_hn, ndtp->dt_hn, MD_MAX_NODENAME_PLUS_1);
	if (rv != 0)
		return (rv);

	/* Everything else matched, so the timestamps decide. */
	/*CSTYLED*/
	return (timercmp(&odtp->dt_tv, &ndtp->dt_tv, !=));
}

/*
 * Append a copy of the data tag ndtp to the set's tag list unless an
 * equal tag (per dtl_cmp) is already on the list.  A newly added tag is
 * given an id equal to its 1-based position in the list.
 *
 * Always returns 0.
 *
 * Should not call for MN diskset since data tags are not supported.
 */
static int
dtl_addl(mddb_set_t *s, const mddb_dtag_t *ndtp)
{
	mddb_dtag_lst_t	**linkp = &s->s_dtlp;
	mddb_dtag_lst_t	*node;
	int		count = 0;

	/* Run to the end of the list, bailing out on a duplicate */
	while ((node = *linkp) != NULL) {
		if (dtl_cmp(&node->dtl_dt, ndtp) == 0)
			return (0);
		count++;
		linkp = &node->dtl_nx;
	}

	/* Add the new member */
	node = kmem_zalloc(sizeof (*node), KM_SLEEP);
	*linkp = node;

	/* Update the dtag portion of the list */
	bcopy((caddr_t)ndtp, (caddr_t)&node->dtl_dt, sizeof (mddb_dtag_t));

	/* Fix up the id value */
	node->dtl_dt.dt_id = count + 1;

	return (0);
}

/*
 * Return the number of entries on the set's data tag list.
 *
 * Even though data tags are not supported in MN disksets, dtl_cntl may
 * be called for a MN diskset since this routine is called even before
 * it is known the kind of diskset being read in from disk.
 * For a MN diskset, s_dtlp is 0 so a count of 0 is returned.
 */
static int
dtl_cntl(mddb_set_t *s)
{
	mddb_dtag_lst_t	*cur;
	int		count = 0;

	for (cur = s->s_dtlp; cur != NULL; cur = cur->dtl_nx)
		count++;

	return (count);
}

/*
 * Search the set's data tag list for the tag whose dt_id equals id.
 * Returns a pointer to the tag stored in the list, or NULL if no entry
 * has that id.
 *
 * Even though data tags are not supported in MN disksets, dtl_findl may
 * be called for a MN diskset since this routine is called even before
 * it is known the kind of diskset being read in from disk.
 * For a MN diskset, s_dtlp is 0 so NULL is returned.
 */
static mddb_dtag_t *
dtl_findl(mddb_set_t *s, int id)
{
	mddb_dtag_lst_t	*cur;

	for (cur = s->s_dtlp; cur != NULL; cur = cur->dtl_nx) {
		if (cur->dtl_dt.dt_id == id)
			return (&cur->dtl_dt);
	}
	return ((mddb_dtag_t *)NULL);
}

/*
 * Free every entry of the data tag list headed by *dtlpp and reset the
 * head pointer to NULL.
 *
 * Should not call for MN diskset since data tags are not supported.
 */
static void
dtl_freel(mddb_dtag_lst_t **dtlpp)
{
	mddb_dtag_lst_t	*cur = *dtlpp;
	mddb_dtag_lst_t	*next;

	while (cur != NULL) {
		/* Pick up the link before the entry is freed. */
		next = cur->dtl_nx;
		kmem_free(cur, sizeof (mddb_dtag_lst_t));
		cur = next;
	}
	*dtlpp = (mddb_dtag_lst_t *)NULL;
}

/*
 * Prepare the set's incore data tag record (md_set[setno].s_dtp).
 *
 * The record is allocated if it does not exist yet.  If it already exists
 * and no tag is supplied (dtagp is NULL) it is zeroed.  The magic and
 * revision are then set, the supplied tag (if any) is copied in, the set
 * number is filled in, the id is cleared and the record is checksummed.
 *
 * Even though data tags are not supported in MN disksets, dt_setup will
 * be called for a MN diskset since this routine is called even before
 * it is known the kind of diskset being read in from disk.
 * Once this set is known as a MN diskset, the dtp area will be freed.
 */
static void
dt_setup(mddb_set_t *s, const mddb_dtag_t *dtagp)
{
	set_t		setno = s->s_setno;
	mddb_dt_t	*dtp;

	if (md_set[setno].s_dtp == (mddb_dt_t *)NULL) {
		md_set[setno].s_dtp = kmem_zalloc(MDDB_DT_BYTES, KM_SLEEP);
	} else if (dtagp == (mddb_dtag_t *)NULL) {
		bzero((caddr_t)md_set[setno].s_dtp, MDDB_DT_BYTES);
	}

	/* shorthand */
	dtp = (mddb_dt_t *)md_set[setno].s_dtp;

	dtp->dt_mag = MDDB_MAGIC_DT;
	dtp->dt_rev = MDDB_REV_DT;

	/* Take over the caller's tag, if one was given */
	if (dtagp != NULL) {
		dtp->dt_dtag = *dtagp;		/* structure assignment */
	}

	/* The set number always comes from the set itself */
	dtp->dt_dtag.dt_setno = setno;

	/* Clear the id, this is only used in user land */
	dtp->dt_dtag.dt_id = 0;

	/* Checksum it */
	crcgen(dtp, &dtp->dt_cks, MDDB_DT_BYTES, NULL);
}

/*
 * Create a new data tag for this host (host id, node name and current
 * time), install it as the set's data tag record and make it the only
 * entry on the set's tag list.
 *
 * Returns 0 on success.  Returns 1, with ep set to MDE_DB_NOTAGREC, when
 * no tag record is allocated in the locator block of a non-MN set.
 *
 * Should not call for MN diskset since data tags are not supported.
 */
static int
set_dtag(mddb_set_t *s, md_error_t *ep)
{
	mddb_lb_t	*lbp = s->s_lbp;
	mddb_dtag_t	newtag;

	if (lbp->lb_dtblkcnt == 0) {
		/* Data tags not used in a MN set - so no failure returned */
		if (!(lbp->lb_flags & MDDB_MNSET)) {
			cmn_err(CE_WARN,
			    "No tag record allocated, unable to tag data");
			(void) mdmddberror(ep, MDE_DB_NOTAGREC, NODEV32,
			    s->s_setno);
			return (1);
		}
		return (0);
	}

	/* Start from an all-zero tag */
	bzero((caddr_t)&newtag, sizeof (mddb_dtag_t));

	/* Get the HW serial number for this host */
	(void) snprintf(newtag.dt_sn, MDDB_SN_LEN, "%u",
	    zone_get_hostid(NULL));
	newtag.dt_sn[MDDB_SN_LEN - 1] = '\0';

	/* Get the nodename that this host goes by */
	(void) strncpy(newtag.dt_hn, utsname.nodename, MD_MAX_NODENAME);
	newtag.dt_hn[MD_MAX_NODENAME] = '\0';

	/* Get a time stamp for NOW */
	uniqtime32(&newtag.dt_tv);

	/* Setup the data tag record */
	dt_setup(s, &newtag);

	/* Replace whatever tag list exists with just the new tag */
	dtl_freel(&s->s_dtlp);
	(void) dtl_addl(s, &newtag);

	return (0);
}

/*
 * Read the data tag record of the replica described by rip into a newly
 * allocated buffer (rip->ri_dtp) and validate it.
 *
 * If called during upgrade, this routine expects a non-translated
 * (aka target) dev.
 * Should not call for MN diskset since data tags are not supported.
 *
 * Returns:
 *	0 - a valid, non-empty tag was read; rip->ri_dtp holds it and
 *	    MDDB_F_TAGDATA is set in rip->ri_flags.
 *	1 - no tag record is allocated, the device could not be
 *	    translated, or the tag could not be read or is invalid.
 *	    For read/validation failures MD_SET_BADTAG is set on the set
 *	    and MDDB_F_BADTAG on rip.
 *	2 - the tag record is valid but empty.
 *
 * On every failure after the buffer has been allocated, the buffer is
 * freed and rip->ri_dtp is reset to NULL.
 */
static int
dt_read(mddb_set_t *s, mddb_lb_t *lbp, mddb_ri_t *rip)
{
	int		err = 0;
	md_dev64_t	dev;
	caddr_t		tbuf;
	daddr_t		physblk;
	mddb_block_t	blk;
	mddb_dt_t	*dtp;
	mddb_dtag_t	*dtagp;
	set_t		setno = s->s_setno;

	/* If have not allocated a data tag record, there is nothing to do */
	if (lbp->lb_dtblkcnt == 0)
		return (1);

	/* A KM_SLEEP allocation does not fail, so no NULL check is needed */
	dtp = rip->ri_dtp = (mddb_dt_t *)kmem_zalloc(MDDB_DT_BYTES, KM_SLEEP);

	/* shorthand */
	dev = md_xlate_targ_2_mini(rip->ri_dev);
	if (dev == NODEV64) {
		/*
		 * Don't leave an all-zero tag buffer hanging off the rip;
		 * release it like the other failure paths do.  The bad
		 * tag status is not set since nothing was read.
		 */
		kmem_free(dtp, MDDB_DT_BYTES);
		rip->ri_dtp = (mddb_dt_t *)NULL;
		return (1);
	}

	tbuf = (caddr_t)rip->ri_dtp;

	/* Read the tag record in, one mddb block at a time */
	for (blk = 0; blk < lbp->lb_dtblkcnt; blk++) {
		physblk = getphysblk((blk + lbp->lb_dtfirstblk), rip->ri_mbip);
		err = getblks(s, tbuf, dev, physblk, btodb(MDDB_BSIZE), 0);
		/* error reading the tag */
		if (err) {
			err = 1;
			goto out;
		}
		tbuf += MDDB_BSIZE;
	}

	/* magic is valid? */
	if (dtp->dt_mag != MDDB_MAGIC_DT) {
		err = 1;
		goto out;
	}

	/* revision is valid? */
	if (revchk(MDDB_REV_DT, dtp->dt_rev)) {
		err = 1;
		goto out;
	}

	/* crc is valid? */
	if (crcchk(dtp, &dtp->dt_cks, MDDB_DT_BYTES, NULL)) {
		err = 1;
		goto out;
	}

	/* shorthand */
	dtagp = &dtp->dt_dtag;

	/* set number match? */
	if (dtagp->dt_setno != setno) {
		err = 1;
		goto out;
	}

	/* tag is not empty? */
	if (dtagp->dt_sn[0] == '\0' && dtagp->dt_hn[0] == '\0' &&
	    (dtagp->dt_tv.tv_sec == 0 && dtagp->dt_tv.tv_usec == 0) &&
	    dtagp->dt_id == 0) {
		err = 2;
		goto out;
	}

	/* Mark the locator as having tagged data */
	rip->ri_flags |= MDDB_F_TAGDATA;

out:
	if (err) {
		/* Only a bad (as opposed to empty) tag is flagged */
		if (err == 1) {
			md_set_setstatus(setno, MD_SET_BADTAG);
			rip->ri_flags |= MDDB_F_BADTAG;
		}
		kmem_free(dtp, MDDB_DT_BYTES);
		rip->ri_dtp = (mddb_dt_t *)NULL;
	}

	return (err);
}

/*
 * Write the set's incore data tag record to every replica that is
 * active, not deleted and not in write error, and update the tag flags
 * of each locator that was written.  If every write succeeded and no
 * such locator carries tagged data afterwards, the MD_SET_CLRTAG and
 * MD_SET_TAGDATA set status bits are cleared.
 *
 * Nothing is written (and 0 is returned) when no tag record is allocated
 * or when the set has MD_SET_BADTAG set.
 *
 * Returns 0 on success, otherwise the OR of the write errors seen.
 *
 * Should not call for MN diskset since data tags are not supported.
 */
static int
dt_write(mddb_set_t *s)
{
	mddb_lb_t	*lbp = s->s_lbp;
	set_t		setno = s->s_setno;
	uint_t		set_status = md_get_setstatus(setno);
	mddb_dt_t	*dtp;
	mddb_dtag_t	*dtagp;
	mddb_locator_t	*lp;
	int		is_empty;
	int		werr;
	int		errs = 0;
	int		li;

	ASSERT(md_set[setno].s_dtp != NULL);

	/* Nowhere to write to, or the ondisk tag area is known bad */
	if (lbp->lb_dtblkcnt == 0 || (set_status & MD_SET_BADTAG))
		return (0);

	/* shorthand */
	dtp = (mddb_dt_t *)md_set[setno].s_dtp;
	dtagp = &dtp->dt_dtag;

	/* See if the tag is empty. */
	is_empty = (dtagp->dt_sn[0] == '\0' && dtagp->dt_hn[0] == '\0' &&
	    (dtagp->dt_tv.tv_sec == 0 && dtagp->dt_tv.tv_usec == 0) &&
	    dtagp->dt_id == 0);

	/* Write the tag to the locators and reset appropriate flags. */
	for (li = 0; li < lbp->lb_loccnt; li++) {
		lp = &lbp->lb_locators[li];

		if (!(lp->l_flags & MDDB_F_ACTIVE) ||
		    (lp->l_flags & (MDDB_F_DELETED | MDDB_F_EWRITE)))
			continue;

		werr = writeblks(s, (caddr_t)dtp, lbp->lb_dtfirstblk,
		    MDDB_DT_BLOCKS, li, MDDB_WR_ONLY_MASTER);

		/* Remember the failure but keep going with the rest */
		if (werr) {
			errs |= werr;
			continue;
		}

		lp->l_flags &= ~MDDB_F_BADTAG;
		if (is_empty)
			lp->l_flags &= ~MDDB_F_TAGDATA;
		else
			lp->l_flags |= MDDB_F_TAGDATA;
	}

	if (errs)
		return (errs);

	/* If the tags were written, check to see if any tags remain. */
	for (li = 0; li < lbp->lb_loccnt; li++) {
		lp = &lbp->lb_locators[li];

		if (!(lp->l_flags & MDDB_F_ACTIVE) ||
		    (lp->l_flags & (MDDB_F_DELETED | MDDB_F_EWRITE)))
			continue;

		/* A usable replica still holds tagged data */
		if (lp->l_flags & MDDB_F_TAGDATA)
			return (0);
	}

	/* There are no tags, so clear CLRTAG and TAGDATA */
	md_clr_setstatus(setno, MD_SET_CLRTAG);
	md_clr_setstatus(setno, MD_SET_TAGDATA);

	return (0);
}

/*
 * Make sure the set has usable blocks for the data tag record.
 *
 * Returns 0 if nothing was done: the record is already allocated and no
 * bad tag was seen, the block bitmap is not set up yet, or no free blocks
 * could be found.  Returns 1 if the record was allocated or moved (the
 * usable locators are then flagged MDDB_F_BADTAG), or if the existing
 * blocks were simply marked busy.
 *
 * Should not call for MN diskset since data tags are not supported.
 */
static int
dt_alloc_if_needed(mddb_set_t *s)
{
	mddb_lb_t	*lbp = s->s_lbp;
	mddb_block_t	blkcnt = lbp->lb_dtblkcnt;
	set_t		setno = s->s_setno;
	uint_t		set_status = md_get_setstatus(setno);
	int		badtag = ((set_status & MD_SET_BADTAG) != 0);
	int		relocate = 0;
	int		bi;
	int		li;

	/*
	 * If the data tag record is allocated (blkcnt != 0) and a bad tag was
	 * not detected, there is nothing to do.
	 */
	if (blkcnt != 0 && !badtag)
		return (0);

	/* Bitmap not setup, checks can't be done */
	if (s->s_totalblkcnt == 0)
		return (0);

	/*
	 * While reading the tag(s) an invalid tag data record was seen;
	 * see if the invalid tag needs to be moved.
	 */
	if (badtag) {
		for (bi = 0; bi < MDDB_DT_BLOCKS; bi++) {
			if (blkcheck(s, (bi + lbp->lb_dtfirstblk))) {
				relocate = 1;
				break;
			}
		}
	}

	if (!relocate && blkcnt != 0) {
		/*
		 * Make sure the blocks are owned, since the calculation in
		 * computefreeblks() is bypassed when MD_SET_BADTAG is set.
		 */
		for (bi = 0; bi < MDDB_DT_BLOCKS; bi++)
			blkbusy(s, (bi + lbp->lb_dtfirstblk));

		return (1);
	}

	/* Need to move or allocate the tag data record */
	lbp->lb_dtfirstblk = getfreeblks(s, MDDB_DT_BLOCKS);
	if (lbp->lb_dtfirstblk == 0) {
		cmn_err(CE_WARN,
		    "Unable to allocate data tag record");
		return (0);
	}
	lbp->lb_dtblkcnt = MDDB_DT_BLOCKS;

	/* Mark the locators so that they get written to disk. */
	for (li = 0; li < lbp->lb_loccnt; li++) {
		mddb_locator_t	*lp = &lbp->lb_locators[li];

		if (!(lp->l_flags & MDDB_F_ACTIVE) ||
		    (lp->l_flags & (MDDB_F_DELETED | MDDB_F_EWRITE)))
			continue;

		lp->l_flags |= MDDB_F_BADTAG;
	}
	return (1);
}

/*
 * Writestart writes the incore mddb out to all of the replicas.
 * This is called when a diskset is started and when an error has
 * been encountered during the write to a mddb.
 *
 * flag can be 2 values:
 *	MDDB_WRITECOPY_ALL - write all records to all mddbs.  This is
 *		always used for traditional and local disksets.
 *		This is the normal path for MN disksets since the slave
 *		nodes aren't actually allowed to write to disk.
 *	MDDB_WRITECOPY_SYNC - special case for MN diskset.  When a new
 *		master has been chosen, the new master may need to
 *		write its incore mddb to disk (this is the case where the
 *		old master had executed a message but hadn't relayed it
 *		to this slave yet).  New master should not write the
 *		change log records since new master would be overwriting
 *		valuable data.  Only used during a reconfig cycle.
 *
 * Returns 0 on success.  Returns 1 if writing a replica copy or fixing
 * the optimized records failed, or the error from push_lb()/writeall()
 * if pushing the locator block or locator names failed.
 * On success MD_SET_STALE is cleared for the set.
 */
static int
writestart(
	mddb_set_t	*s,
	int		flag
)
{
	int		li;
	mddb_locator_t	*lp;
	mddb_lb_t	*lbp;
	mddb_ln_t	*lnp;
	int		err = 0;
	uint_t		set_status;

	lbp = s->s_lbp;

	/*
	 * Pass 1: active replicas flagged as suspect are rewritten
	 * unconditionally, without comparing them first.
	 */
	for (li = 0; li < lbp->lb_loccnt; li++) {
		lp = &lbp->lb_locators[li];
		if (! (lp->l_flags & MDDB_F_ACTIVE))
			continue;
		if (! (lp->l_flags & MDDB_F_SUSPECT))
			continue;
		if (writecopy(s, li, flag))
			return (1);
		lp->l_flags |= MDDB_F_UP2DATE;
	}

	/*
	 * Pass 2: the remaining active replicas that are not yet up to
	 * date are compared with the incore copy and rewritten only if
	 * checkcopy() reports a difference or a read error.
	 */
	for (li = 0; li < lbp->lb_loccnt; li++) {
		lp = &lbp->lb_locators[li];
		if (! (lp->l_flags & MDDB_F_ACTIVE))
			continue;
		if ((lp->l_flags & MDDB_F_UP2DATE))
			continue;
		if (checkcopy(s, li))
			if (err = writecopy(s, li, flag))
				return (1);
		lp->l_flags |= MDDB_F_UP2DATE;
	}

	/*
	 * Call fixoptrecord even during a reconfig cycle since a replica
	 * failure may force the master to re-assign the optimized
	 * resync record to another replica.
	 */
	if (fixoptrecords(s))
		return (1);

	set_status = md_get_setstatus(s->s_setno);

	/*
	 * See if any (ACTIVE and not OLDACT) or (not ACTIVE and OLDACT).
	 * The loop also stops early on a locator with tag flags set while
	 * the set has TAGDATA or CLRTAG status.  li != lb_loccnt after the
	 * loop means such a locator was found.
	 */
	for (li = 0; li < lbp->lb_loccnt; li++) {
		lp = &lbp->lb_locators[li];

		if (lp->l_flags & MDDB_F_DELETED)
			continue;

		if (((lp->l_flags & MDDB_F_ACTIVE) != 0 &&
		    (lp->l_flags & MDDB_F_OLDACT) == 0) ||
		    ((lp->l_flags & MDDB_F_ACTIVE) == 0 &&
		    (lp->l_flags & MDDB_F_OLDACT) != 0))
			break;

		if ((set_status & MD_SET_TAGDATA) ||
		    (set_status & MD_SET_CLRTAG))
			if ((lp->l_flags & MDDB_F_TAGDATA) ||
			    (lp->l_flags & MDDB_F_BADTAG))
				break;
	}

	/*
	 * If we found (ACTIVE and not OLDACT) or (not ACTIVE and OLDACT)
	 * or the lbp identifier and the set identifier don't match, the
	 * locator block (and locator names) must be pushed to disk.
	 */
	if (li != lbp->lb_loccnt || cmpidentifier(s, &lbp->lb_ident)) {

		/* Only call for traditional and local sets */
		if (!(lbp->lb_flags & MDDB_MNSET))
			(void) dt_write(s);

		setidentifier(s, &lbp->lb_ident);

		/*
		 * The locator block is pushed twice, and the mediator
		 * hosts are updated after each push whether or not the
		 * push succeeded.
		 */
		if (err = push_lb(s)) {
			(void) upd_med(s, "writestart(0)");
			return (err);
		}

		(void) upd_med(s, "writestart(0)");

		if (err = push_lb(s)) {
			(void) upd_med(s, "writestart(1)");
			return (err);
		}

		(void) upd_med(s, "writestart(1)");

		/* Re-stamp, re-checksum and write out the locator names */
		lnp = s->s_lnp;
		uniqtime32(&lnp->ln_timestamp);
		if (lbp->lb_flags & MDDB_MNSET)
			lnp->ln_revision = MDDB_REV_MNLN;
		else
			lnp->ln_revision = MDDB_REV_LN;
		crcgen(lnp, &lnp->ln_checksum, dbtob(lbp->lb_lnblkcnt), NULL);
		err = writeall(s, (caddr_t)lnp, lbp->lb_lnfirstblk,
		    lbp->lb_lnblkcnt, 0);
		/*
		 * If a MN diskset and this is the master, set the PARSE_LOCNM
		 * flag in the mddb_set structure to show that the locator
		 * names have changed.
		 * Don't set parseflags as a result of a new master sync
		 * during reconfig cycle since slaves nodes are already
		 * in-sync with the new master.
		 */

		if ((lbp->lb_flags & MDDB_MNSET) &&
		    (md_set[s->s_setno].s_am_i_master) &&
		    (flag != MDDB_WRITECOPY_SYNC)) {
			s->s_mn_parseflags |= MDDB_PARSE_LOCNM;
		}

		/* The parse flag above is set even if writeall() failed */
		if (err)
			return (err);
	}

	/* Record the current active state as the "old active" state */
	for (li = 0; li < lbp->lb_loccnt; li++) {
		lp = &lbp->lb_locators[li];
		if (lp->l_flags & MDDB_F_DELETED)
			continue;
		if (lp->l_flags & MDDB_F_ACTIVE) {
			lp->l_flags |= MDDB_F_OLDACT;
		} else {
			lp->l_flags &= ~MDDB_F_OLDACT;
		}
	}

	md_clr_setstatus(s->s_setno, MD_SET_STALE);

	return (0);
}

/*
 * selectreplicas selects the working replicas and may write the incore
 * version of the mddb out to the replicas ondisk.
 *
 * flag can be 3 values:
 *	MDDB_RETRYSCAN - quick scan to see if there is an error.
 *			If no new error, returns without writing mddb
 *			to disks.  If a new error is seen, writes out
 *			mddb to disks.
 *	MDDB_SCANALL  - lengthy scan to check out mddbs and always writes
 *			out mddb to the replica ondisk.  Calls writecopy
 *			with MDDB_WRITECOPY_ALL flag which writes out
 *			all records to the replicas ondisk.
 *	MDDB_SCANALLSYNC - called during reconfig cycle to sync up incore
 *			and ondisk mddbs by writing incore values to disk.
 *			Calls writecopy with MDDB_WRITECOPY_SYNC flag so
 *			that change log records are not written out.
 *			Only used by MN disksets.
 *
 * Returns:
 *	0 - Successful
 *	1 - Unable to write incore mddb data to disk since < 50% replicas.
 *
 * 1 is also returned, without writing anything, when the set is stale.
 * For MDDB_RETRYSCAN with no new write error, the current MD_SET_TOOFEW
 * status bit of the set is returned instead.
 */
int
selectreplicas(
	mddb_set_t	*s,
	int		flag
)
{
	int		li;
	int		alc;
	int		lc;
	mddb_locator_t	*lp;
	mddb_lb_t	*lbp = s->s_lbp;
	set_t		setno = s->s_setno;
	int		wc_flag;

	/*
	 * can never transition from stale to not stale
	 *
	 * For a stale set only the ACTIVE flags are refreshed: every
	 * non-deleted replica without a master block error is active.
	 */
	if (md_get_setstatus(setno) & MD_SET_STALE) {
		for (li = 0; li < lbp->lb_loccnt; li++) {
			lp = &lbp->lb_locators[li];
			if (lp->l_flags & MDDB_F_DELETED)
				continue;
			if (! (lp->l_flags & MDDB_F_EMASTER)) {
				lp->l_flags |= MDDB_F_ACTIVE;
			} else {
				lp->l_flags &= ~MDDB_F_ACTIVE;
			}
		}
		return (1);
	}

	if ((flag == MDDB_SCANALL) || (flag == MDDB_SCANALLSYNC)) {
		/*
		 * Full scan: remember the previous active state in OLDACT,
		 * treat previously inactive replicas as suspect, then
		 * re-activate everything without a master block error and
		 * forget earlier write/size errors.
		 */
		for (li = 0; li < lbp->lb_loccnt; li++) {
			lp = &lbp->lb_locators[li];
			if (lp->l_flags & MDDB_F_DELETED)
				continue;
			if (lp->l_flags & MDDB_F_ACTIVE) {
				lp->l_flags |= MDDB_F_OLDACT;
				lp->l_flags &= ~MDDB_F_SUSPECT;
			} else {
				lp->l_flags |= MDDB_F_SUSPECT;
				lp->l_flags &= ~MDDB_F_OLDACT;
			}

			if (! (lp->l_flags & MDDB_F_EMASTER)) {
				lp->l_flags |= MDDB_F_ACTIVE;
				lp->l_flags &= ~MDDB_F_EWRITE;
				lp->l_flags &= ~MDDB_F_TOOSMALL;
			} else {
				lp->l_flags &= ~MDDB_F_ACTIVE;
			}
		}
		computefreeblks(s); /* set up free block bits */
	} else {
		/* Quick scan: find the first active replica in write error */
		for (li = 0; li < lbp->lb_loccnt; li++) {
			lp = &lbp->lb_locators[li];
			if (! (lp->l_flags & MDDB_F_ACTIVE))
				continue;
			if (lp->l_flags & MDDB_F_EWRITE)
				break;
		}

		/*
		 * if there are no errors, this error has already
		 * been processed; return current state
		 */
		if (li == lbp->lb_loccnt)
			return (md_get_setstatus(setno) & MD_SET_TOOFEW);

		/*
		 * Deactivate the failed replica and clear UP2DATE on it
		 * and on every locator after it.
		 */
		lp->l_flags &= ~MDDB_F_ACTIVE;
		do {
			lp = &lbp->lb_locators[li];
			lp->l_flags &= ~MDDB_F_UP2DATE;
		} while (++li < lbp->lb_loccnt);
	}

	/* Count existing (lc) and active (alc) replicas */
	alc = 0;
	lc = 0;
	for (li = 0; li < lbp->lb_loccnt; li++) {
		lp = &lbp->lb_locators[li];
		if (lp->l_flags & MDDB_F_DELETED)
			continue;
		lc++;
		if (! (lp->l_flags & MDDB_F_ACTIVE))
			continue;
		alc++;
	}

	/* Fewer than half (rounded up) of the replicas are active */
	if (alc < ((lc + 1) / 2)) {
		md_set_setstatus(setno, MD_SET_TOOFEW);
		return (1);
	}

	/* Set wc_flag based on flag passed in. */
	if (flag == MDDB_SCANALLSYNC)
		wc_flag = MDDB_WRITECOPY_SYNC;
	else
		wc_flag = MDDB_WRITECOPY_ALL;

	/*
	 * Keep trying writestart() for as long as enough replicas remain.
	 * After each failure the replicas that took a write error are
	 * deactivated and the usable replicas are recounted.
	 */
	do {
		if (! writestart(s, wc_flag)) {
			md_clr_setstatus(setno, MD_SET_TOOFEW);
			return (0);
		}
		alc  = 0;
		for (li = 0; li < lbp->lb_loccnt; li++) {
			lp = &lbp->lb_locators[li];
			if ((lp->l_flags & MDDB_F_DELETED) ||
			    (lp->l_flags & MDDB_F_EMASTER))
				continue;

			if (lp->l_flags & MDDB_F_EWRITE) {
				lp->l_flags &= ~MDDB_F_ACTIVE;
				lp->l_flags &= ~MDDB_F_UP2DATE;
				continue;
			}
			alc++;
		}
	} while (alc >= ((lc + 1) / 2));
	md_set_setstatus(setno, MD_SET_TOOFEW);
	return (1);
}

/*
 * Report whether the set is currently flagged stale or short of replicas,
 * optionally rescanning the replicas to try to clear that condition.
 *
 *	s	- database set to check; must not be NULL
 *	probe	- MDDB_NOPROBE to report the flagged state without rescanning;
 *		  any other value causes a selectreplicas(MDDB_SCANALL) pass
 *
 * Returns:
 *	0 if neither MD_SET_STALE nor MD_SET_TOOFEW is set
 *	1 if one of them is set and probe is MDDB_NOPROBE
 *	otherwise the result of selectreplicas(); if that succeeded and
 *	s_zombie is non-zero, the result of deleting the zombie record.
 *
 * The set mutex is released around mddb_deleterec() and re-acquired
 * afterwards, so the caller is expected to hold SETMUTEX(s->s_setno) on
 * entry -- NOTE(review): confirm against callers.
 */
static int
checkstate(
	mddb_set_t	*s,
	int		probe
)
{
	int		error;
	uint_t		set_status;

	/* Check s before it is dereferenced to fetch the set status. */
	ASSERT(s != NULL);

	set_status = md_get_setstatus(s->s_setno);

	if (! (set_status & MD_SET_STALE) && ! (set_status & MD_SET_TOOFEW))
		return (0);

	if (probe == MDDB_NOPROBE)
		return (1);

	single_thread_start(s);
	error = selectreplicas(s, MDDB_SCANALL);
	single_thread_end(s);

	/* The rescan succeeded: retry deletion of the pending zombie record */
	if (error == 0 && s->s_zombie != 0) {
		mutex_exit(SETMUTEX(s->s_setno));
		error = mddb_deleterec(s->s_zombie);
		mutex_enter(SETMUTEX(s->s_setno));
		if (error == 0)
			s->s_zombie = 0;
	}
	return (error);
}

/*
 * Re-run replica selection: first with MDDB_RETRYSCAN and, only if that
 * pass returns non-zero, again with MDDB_SCANALL.
 *
 * Returns 1 when both passes return non-zero, otherwise 0.
 */
static int
writeretry(
	mddb_set_t	*s
)
{
	/* The retry scan was enough */
	if (selectreplicas(s, MDDB_RETRYSCAN) == 0)
		return (0);

	/* Fall back to a full scan */
	if (selectreplicas(s, MDDB_SCANALL) == 0)
		return (0);

	return (1);
}

/*
 * Free every element of the in-core master block list headed by *mbipp
 * (each element is released as MDDB_IC_BSIZE bytes) and set *mbipp to NULL.
 */
static void
free_mbipp(mddb_mb_ic_t **mbipp)
{
	mddb_mb_ic_t	*cur = *mbipp;
	mddb_mb_ic_t	*next;

	while (cur != NULL) {
		/* Pick up the link before the element goes away */
		next = cur->mbi_next;
		kmem_free((caddr_t)cur, MDDB_IC_BSIZE);
		cur = next;
	}
	*mbipp = (mddb_mb_ic_t *)NULL;
}

/*
 * Build a new list that mirrors s->s_rip, one freshly allocated element
 * per original element and in the same order.  Each copy starts as a
 * structure copy of the original and then has its flags, commit count,
 * transplant marker and every pointer member cleared, so the copies share
 * no storage with the originals.
 *
 * Returns the head of the new list, or NULL if s->s_rip is empty.
 */
static mddb_ri_t *
save_rip(mddb_set_t *s)
{
	mddb_ri_t	*head = NULL;
	mddb_ri_t	**tailp = &head;	/* where the next copy is linked */
	mddb_ri_t	*src;
	mddb_ri_t	*copy;

	for (src = s->s_rip; src != NULL; src = src->ri_next) {
		copy = kmem_zalloc(sizeof (*copy), KM_SLEEP);

		ASSERT(copy != NULL);

		/* Start from a structure copy of the original */
		*copy = *src;

		/* Clear the stuff that is not needed for hints */
		copy->ri_flags = 0;
		copy->ri_commitcnt = 0;
		copy->ri_transplant = 0;
		copy->ri_mbip = (mddb_mb_ic_t *)NULL;
		copy->ri_dtp = (mddb_dt_t *)NULL;
		copy->ri_lbp = (mddb_lb_t *)NULL;
		copy->ri_did_icp = (mddb_did_ic_t *)NULL;
		copy->ri_devid = (ddi_devid_t)NULL;
		copy->ri_old_devid = (ddi_devid_t)NULL;
		copy->ri_next = (mddb_ri_t *)NULL;

		/* Append to the end of the new list */
		*tailp = copy;
		tailp = &copy->ri_next;
	}
	return (head);
}

/*
 * Free the replica-info list headed by *ripp.  For each element, ri_devid
 * and ri_old_devid are released with ddi_devid_free() when set, then the
 * element itself is freed.  *ripp is set to NULL on return.
 */
static void
free_rip(mddb_ri_t **ripp)
{
	mddb_ri_t	*cur = *ripp;
	mddb_ri_t	*next;

	while (cur != (mddb_ri_t *)NULL) {
		/* Pick up the link before the element goes away */
		next = cur->ri_next;

		if (cur->ri_devid != (ddi_devid_t)NULL) {
			ddi_devid_free(cur->ri_devid);
			cur->ri_devid = (ddi_devid_t)NULL;
		}
		if (cur->ri_old_devid != (ddi_devid_t)NULL) {
			ddi_devid_free(cur->ri_old_devid);
			cur->ri_old_devid = (ddi_devid_t)NULL;
		}
		kmem_free((caddr_t)cur, sizeof (*cur));

		cur = next;
	}
	*ripp = (mddb_ri_t *)NULL;
}

/*
 * this routine selects the correct replica to use
 * the rules are as follows
 *	1.	if all replica has same init time select highest commit count
 *	2.	if some but not all replicas are from another hostid discard
 *		them.
 *	3.	find which init time is present is most replicas
 *	4.	discard all replicas which do not match most init times
 *	5.	select replica with highest commit count
 */

static mddb_lb_t *
selectlocator(
	mddb_set_t	*s
)
{
	mddb_ri_t	*rip = s->s_rip;
	mddb_ri_t	*r, *r1;
	mddb_lb_t	*lbp;
	struct timeval32 *tp = (struct timeval32 *)NULL;
	int		different;
	int		same;
	int		count;
	int		maxcount;
	set_t		setno = s->s_setno;
	size_t		sz;
	int		mn_set = 0;

	/* Clear the ri_transplant flag on all the rip entries. */
	/* Set ri_commitcnt to locator's commitcnt - if available */
	for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) {
		r->ri_transplant = 0;
		if (r->ri_lbp != (mddb_lb_t *)NULL) {
			r->ri_commitcnt = r->ri_lbp->lb_commitcnt;
			/* If any locators have MN bit set, set flag */
			if (r->ri_lbp->lb_flags & MDDB_MNSET)
				mn_set = 1;
		}
	}

	/*
	 * A data tag is being used, so use it to limit the selection first.
	 * Data tags not used in MN diskset.
	 */
	if ((mn_set == 0) && (md_get_setstatus(setno) & MD_SET_USETAG)) {
		mddb_dt_t	*dtp = (mddb_dt_t *)md_set[setno].s_dtp;

		/*
		 * now toss any locators that have a different data tag
		 */
		for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) {
			if (r->ri_lbp == (mddb_lb_t *)NULL)
				continue;

			if (r->ri_dtp != (mddb_dt_t *)NULL) {
				/* If same tag, keep it */
				if (dtl_cmp(&dtp->dt_dtag,
				    &r->ri_dtp->dt_dtag) == 0)
					continue;
			}

			if (r->ri_dtp != (mddb_dt_t *)NULL) {
				kmem_free((caddr_t)r->ri_dtp, MDDB_DT_BYTES);
				r->ri_dtp = (mddb_dt_t *)NULL;
			}

			mddb_devid_icp_free(&r->ri_did_icp, r->ri_lbp);
			if (!(md_get_setstatus(setno) &
			    MD_SET_REPLICATED_IMPORT)) {
				if (r->ri_old_devid != (ddi_devid_t)NULL) {
					sz = ddi_devid_sizeof(r->ri_old_devid);
					kmem_free((caddr_t)r->ri_old_devid, sz);
					r->ri_old_devid = (ddi_devid_t)NULL;
				}
			}

			kmem_free((caddr_t)r->ri_lbp,
			    dbtob(r->ri_lbp->lb_blkcnt));
			r->ri_lbp = (mddb_lb_t *)NULL;

			r->ri_transplant = 1;
		}

		/* Tag used, clear the bit */
		md_clr_setstatus(s->s_setno, MD_SET_USETAG);

		if (md_get_setstatus(s->s_setno) & MD_SET_TAGDATA) {
			/*
			 * Get rid of the list of tags.
			 */
			dtl_freel(&s->s_dtlp);

			/*
			 * Re-create the list with the tag used.
			 */
			(void) dtl_addl(s, &dtp->dt_dtag);
		}
	}

	/*
	 * scan to see if all replicas have same time
	 */
	for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) {
		if (r->ri_lbp == (mddb_lb_t *)NULL)
			continue;
		if (tp == NULL) {
			tp = &r->ri_lbp->lb_inittime;
			continue;
		}
		/* CSTYLED */
		if (timercmp(tp, &r->ri_lbp->lb_inittime, !=))
			break;
	}

	/*
	 * if r == NULL then they were all them same. Choose highest
	 * commit count
	 */
	if (r == (mddb_ri_t *)NULL)
		goto out;

	/*
	 * If here, a bogus replica is present and at least 1 lb_inittime
	 * did not match.
	 */

	/*
	 * look and see if any but not all are from different id
	 */

	different = 0;
	same = 0;
	for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) {
		if (r->ri_lbp == (mddb_lb_t *)NULL)
			continue;
		if (cmpidentifier(s, &r->ri_lbp->lb_ident))
			different = 1;
		else
			same = 1;
	}

	/*
	 * now go through and throw out different if there are some
	 * that are the same
	 */
	if (different != 0 && same != 0) {
		for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) {
			if (r->ri_lbp == (mddb_lb_t *)NULL)
				continue;

			if (!cmpidentifier(s, &r->ri_lbp->lb_ident))
				continue;

			if (r->ri_dtp != (mddb_dt_t *)NULL) {
				kmem_free((caddr_t)r->ri_dtp, MDDB_DT_BYTES);
				r->ri_dtp = (mddb_dt_t *)NULL;
			}

			mddb_devid_icp_free(&r->ri_did_icp, r->ri_lbp);
			if (!(md_get_setstatus(setno) &
			    MD_SET_REPLICATED_IMPORT)) {
				if (r->ri_old_devid != (ddi_devid_t)NULL) {
					sz = ddi_devid_sizeof(r->ri_old_devid);
					kmem_free((caddr_t)r->ri_old_devid, sz);
					r->ri_old_devid = (ddi_devid_t)NULL;
				}
			}

			kmem_free((caddr_t)r->ri_lbp,
			    dbtob(r->ri_lbp->lb_blkcnt));
			r->ri_lbp = (mddb_lb_t *)NULL;

			r->ri_transplant = 1;
		}
	}

	/*
	 * go through and pick highest. Use n square because it is
	 * simple and 40 some is max possible
	 */
	maxcount = 0;
	lbp = (mddb_lb_t *)NULL;
	for (r1 = rip; r1 != (mddb_ri_t *)NULL; r1 = r1->ri_next) {
		if (r1->ri_lbp == (mddb_lb_t *)NULL)
			continue;
		count = 0;
		for (r = r1; r != (mddb_ri_t *)NULL; r = r->ri_next) {
			if (r->ri_lbp == (mddb_lb_t *)NULL)
				continue;
			if (timercmp(&r1->ri_lbp->lb_inittime, /* CSTYLED */
			    &r->ri_lbp->lb_inittime, ==))
				count++;
		}
		if (count > maxcount) {
			maxcount = count;
			lbp = r1->ri_lbp;
		}
	}

	/*
	 * now go though and toss any that are of a different time stamp
	 */
	for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) {
		if (r->ri_lbp == (mddb_lb_t *)NULL)
			continue;
		if (timercmp(&lbp->lb_inittime, /* CSTYLED */
		    &r->ri_lbp->lb_inittime, ==))
			continue;

		if (r->ri_dtp != (mddb_dt_t *)NULL) {
			kmem_free((caddr_t)r->ri_dtp, MDDB_DT_BYTES);
			r->ri_dtp = (mddb_dt_t *)NULL;
		}

		mddb_devid_icp_free(&r->ri_did_icp, r->ri_lbp);
		if (!(md_get_setstatus(setno) & MD_SET_REPLICATED_IMPORT)) {
			if (r->ri_old_devid != (ddi_devid_t)NULL) {
				sz = ddi_devid_sizeof(r->ri_old_devid);
				kmem_free((caddr_t)r->ri_old_devid, sz);
				r->ri_old_devid = (ddi_devid_t)NULL;
			}
		}

		kmem_free((caddr_t)r->ri_lbp, dbtob(r->ri_lbp->lb_blkcnt));
		r->ri_lbp = (mddb_lb_t *)NULL;

		r->ri_transplant = 1;
	}

out:
	/*
	 * Find the locator with the highest commit count, and make it the
	 * "chosen" one.
	 */
	lbp = (mddb_lb_t *)NULL;
	for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) {
		if (r->ri_lbp == (mddb_lb_t *)NULL)
			continue;

		if (lbp == NULL) {
			lbp = r->ri_lbp;
			continue;
		}

		if (r->ri_lbp->lb_commitcnt > lbp->lb_commitcnt)
			lbp = r->ri_lbp;
	}

	/* Toss all locator blocks, except the "chosen" one. */
	for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) {
		if (r->ri_lbp == (mddb_lb_t *)NULL)
			continue;

		/* Get rid of all dtp's */
		if (r->ri_dtp != (mddb_dt_t *)NULL) {
			kmem_free((caddr_t)r->ri_dtp, MDDB_DT_BYTES);
			r->ri_dtp = (mddb_dt_t *)NULL;
		}

		if (r->ri_lbp == lbp)
			continue;

		/* Get rid of extra locator devid block info */
		mddb_devid_icp_free(&r->ri_did_icp, r->ri_lbp);
		if (!(md_get_setstatus(setno) & MD_SET_REPLICATED_IMPORT)) {
			if (r->ri_old_devid != (ddi_devid_t)NULL) {
				sz = ddi_devid_sizeof(r->ri_old_devid);
				kmem_free((caddr_t)r->ri_old_devid, sz);
				r->ri_old_devid = (ddi_devid_t)NULL;
			}
		}

		/* Get rid of extra locators */
		kmem_free((caddr_t)r->ri_lbp, dbtob(r->ri_lbp->lb_blkcnt));
		r->ri_lbp = (mddb_lb_t *)NULL;
	}
	return (lbp);
}

/*
 * Fill in the config-location structure clp from locator li of locator
 * block lbp, as seen from side sideno.
 *
 * For a MN set the side locator whose mnl_sideno equals sideno is used;
 * if there is none the routine returns without touching clp.  For a
 * traditional set sideno indexes lb_sidelocators directly.
 *
 * If the locator block is devid style and a devid exists for li:
 *	- with MDDB_DEVID_SPACE set in clp->l_devid_flags, the devid is
 *	  copied to the buffer addressed by clp->l_devid when it fits in
 *	  clp->l_devid_sz (MDDB_DEVID_VALID is set and the minor name is
 *	  copied), otherwise MDDB_DEVID_NOSPACE is set;
 *	- else with MDDB_DEVID_GETSZ set, l_devid_flags becomes
 *	  MDDB_DEVID_SZ and l_devid_sz the devid size.
 *
 * did_icp is only dereferenced for devid style locator blocks.
 */
static void
locator2cfgloc(
	mddb_lb_t		*lbp,
	mddb_cfg_loc_t		*clp,
	int			li,
	side_t			sideno,
	mddb_did_ic_t		*did_icp
)
{
	mddb_locator_t		*loc = &lbp->lb_locators[li];
	mddb_sidelocator_t	*side;
	mddb_mnsidelocator_t	*mnside;
	mddb_mnlb_t		*mnlb;
	mddb_did_info_t		*info;
	mddb_drvnm_t		*drv;
	char			*src, *dst;
	int			n, devid_len, room;
	int			is_mn;

	is_mn = (lbp->lb_flags & MDDB_MNSET) ? 1 : 0;

	if (is_mn) {
		/* Locate this side's entry; give up if there is none */
		mnlb = (mddb_mnlb_t *)lbp;
		n = 0;
		while (n < MD_MNMAXSIDES) {
			mnside = &mnlb->lb_mnsidelocators[n][li];
			if (mnside->mnl_sideno == sideno)
				break;
			n++;
		}
		if (n == MD_MNMAXSIDES)
			return;
	} else {
		side = &lbp->lb_sidelocators[sideno][li];
	}

	if (lbp->lb_flags & MDDB_DEVID_STYLE) {
		info = &(did_icp->did_ic_blkp->blk_info[li]);
		if (info->info_flags & MDDB_DID_EXISTS) {
			devid_len =
			    (int)ddi_devid_sizeof(did_icp->did_ic_devid[li]);
			if (clp->l_devid_flags & MDDB_DEVID_SPACE) {
				room = clp->l_devid_sz;
				if (devid_len > room) {
					clp->l_devid_flags |=
					    MDDB_DEVID_NOSPACE;
				} else {
					/*
					 * copy device id from mddb to
					 * cfg_loc structure, byte by byte
					 */
					src = (char *)
					    did_icp->did_ic_devid[li];
					dst = (char *)(uintptr_t)clp->l_devid;
					for (n = 0; n < devid_len; n++)
						dst[n] = src[n];
					clp->l_devid_flags |= MDDB_DEVID_VALID;
					(void) strcpy(clp->l_minor_name,
					    info->info_minor_name);
				}
			} else if (clp->l_devid_flags & MDDB_DEVID_GETSZ) {
				/* Caller only asked how big the devid is */
				clp->l_devid_flags = MDDB_DEVID_SZ;
				clp->l_devid_sz = devid_len;
			}
		}
	}

	/*
	 * The dev, drvnm and mnum always come from the locators and
	 * sidelocators, even when a devid exists.  During startup these
	 * may not yet match the devid (load_old_replicas brings them into
	 * line later); that is harmless because ridev re-derives them from
	 * the devid and mnum.  After startup they have been updated and
	 * can be used as they are.
	 */
	clp->l_blkno = loc->l_blkno;
	clp->l_flags = loc->l_flags;
	clp->l_dev = loc->l_dev;

	if (is_mn) {
		clp->l_mnum = mnside->mnl_mnum;
		drv = &lbp->lb_drvnm[mnside->mnl_drvnm_index];
	} else {
		clp->l_mnum = side->l_mnum;
		drv = &lbp->lb_drvnm[side->l_drvnm_index];
	}
	(void) strncpy(clp->l_driver, drv->dn_data, MD_MAXDRVNM);
}

/*
 * Find the index into the mnsidelocator array where the entry for sideno
 * at locator li will go.  The index can then be fed into both
 * splitname2locatorblocks and cfgloc2locator so that those entries are
 * kept in sync.
 *
 * For a MN diskset the slot already holding sideno wins; failing that,
 * the first slot whose mnl_sideno is 0 is used.
 *
 * Returns:
 *	0 for a traditional (non-MN) diskset
 *	index (0 <= index < MD_MNMAXSIDES) for a MN diskset, if successful
 *	-1 for a MN diskset with neither a matching nor an unused slot
 */
static int
checklocator(
	mddb_lb_t		*lbp,
	int			li,
	side_t			sideno
)
{
	uchar_t			slot;
	mddb_mnsidelocator_t	*mnslp;
	mddb_mnlb_t		*mnlbp;
	int			first_free = -1;

	/* Traditional disksets have no mnsidelocator slots to search */
	if (!(lbp->lb_flags & MDDB_MNSET))
		return (0);

	mnlbp = (mddb_mnlb_t *)lbp;
	for (slot = 0; slot < MD_MNMAXSIDES; slot++) {
		mnslp = &mnlbp->lb_mnsidelocators[slot][li];

		/* An existing entry for this side always wins */
		if (mnslp->mnl_sideno == sideno)
			return (slot);

		/* Remember the first empty slot, but keep looking */
		if ((mnslp->mnl_sideno == 0) && (first_free == -1))
			first_free = slot;
	}

	/* No match: first empty slot, or -1 if every slot is in use */
	return (first_free);
}

/*
 * Takes locator information (driver name, minor number, sideno) and
 * stores it in the locator block.
 * For traditional diskset, the sideno is the index into the sidelocator
 * array in the locator block.
 * For the MN diskset, the sideno is the nodeid which can be any number,
 * so the index passed in is the index into the mnsidelocator array
 * in the locator block.
 *
 * The driver name is looked up in lb_drvnm and added to the first unused
 * entry if it is not there yet.  For a devid style locator block with a
 * non-zero clp->l_devid, the devid and minor name are also recorded via
 * mddb_devid_add().
 *
 * Returns:
 *	0 on success
 *	1 if index is not a valid mnsidelocator slot for a MN diskset, if
 *	  the driver name table is full, or if mddb_devid_add() fails
 */
static int
cfgloc2locator(
	mddb_lb_t		*lbp,
	mddb_cfg_loc_t		*clp,
	int			li,
	side_t			sideno,
	int			index	/* Only useful in MNsets when > 1 */
)
{
	uchar_t			i;
	mddb_sidelocator_t	*slp;
	mddb_mnsidelocator_t	*mnslp;
	mddb_set_t		*s;
	int			mn_set = 0;
	mddb_mnlb_t		*mnlbp;

	if (lbp->lb_flags & MDDB_MNSET) {
		/*
		 * checklocator returns -1 when no slot is available.
		 * Fail here rather than index outside lb_mnsidelocators
		 * if such a value is ever passed through.
		 */
		if (index < 0 || index >= MD_MNMAXSIDES)
			return (1);

		mnlbp = (mddb_mnlb_t *)lbp;
		mn_set = 1;
		/*
		 * Index will be the slot that has the given sideno or
		 * the first empty slot if no match is found.
		 * This was pre-checked out in check locator.
		 */
		mnslp = &mnlbp->lb_mnsidelocators[index][li];
	} else {
		slp = &lbp->lb_sidelocators[sideno][li];
	}

	/*
	 * Look for the driver name
	 */
	for (i = 0; i < MDDB_DRVNMCNT; i++) {
		if (lbp->lb_drvnm[i].dn_len == 0)
			continue;
		if (strncmp(lbp->lb_drvnm[i].dn_data, clp->l_driver,
		    MD_MAXDRVNM) == 0)
			break;
	}

	/*
	 * Didn't find one, add a new one in the first unused entry
	 */
	if (i == MDDB_DRVNMCNT) {
		for (i = 0; i < MDDB_DRVNMCNT; i++) {
			if (lbp->lb_drvnm[i].dn_len == 0)
				break;
		}
		/* Driver name table is full */
		if (i == MDDB_DRVNMCNT)
			return (1);
		(void) strncpy(lbp->lb_drvnm[i].dn_data, clp->l_driver,
		    MD_MAXDRVNM);
		lbp->lb_drvnm[i].dn_len = (uchar_t)strlen(clp->l_driver);
	}

	/* Fill in the drvnm index */
	if (mn_set) {
		mnslp->mnl_drvnm_index = i;
		mnslp->mnl_mnum = clp->l_mnum;
		mnslp->mnl_sideno = sideno;
	} else {
		slp->l_drvnm_index = i;
		slp->l_mnum = clp->l_mnum;
	}

	if (lbp->lb_flags & MDDB_DEVID_STYLE) {
		/*
		 * This device id could already be associated with this index
		 * if this is not the first side added to the set.
		 * If device id is 0, there is no device id for this device.
		 */
		if ((ddi_devid_t)(uintptr_t)clp->l_devid == 0)
			return (0);
		s = (mddb_set_t *)md_set[lbp->lb_setno].s_db;
		if (mddb_devid_add(s, li, (ddi_devid_t)(uintptr_t)clp->l_devid,
		    clp->l_minor_name)) {
			return (1);
		}
	}

	return (0);
}

/*
 * See if there are mediator hosts and try to use the data.
 *
 * A mediator record is usable only if it is for this set number, its id
 * matches the locator block's create time, it carries the highest commit
 * count among such records, and that commit count equals the locator
 * block's commit count.
 *
 * Returns:
 *	0 if at least one record is usable and either more than half of the
 *	  configured mediators were contacted or a usable record is golden
 *	1 otherwise (the state is stale).  In that case, when an even
 *	  number of mediators is configured and exactly half of them were
 *	  contacted, MD_SET_ACCOK is set on the set.
 */
static int
mediate(
	mddb_set_t	*s
)
{
	mddb_lb_t	*lbp = s->s_lbp;
	set_t		setno = s->s_setno;
	med_data_lst_t	*head;
	med_data_lst_t	*node;
	med_data_lst_t	*next;
	med_data_t	*md;
	uint_t		topcc = 0;	/* highest commit count seen */
	int		contacted = 0;	/* mediators that answered */
	int		valid = 0;	/* records that survived the checks */
	int		golden = 0;	/* surviving records marked golden */
	int		rval = 1;

	/* Do not have a mediator, then the state is stale */
	if (s->s_med.n_cnt == 0)
		return (1);

	/* Contact the mediator hosts for the data; none means stale */
	head = get_med_host_data(&s->s_med, s->s_setname, setno);
	if (head == NULL)
		return (1);

	/*
	 * First pass: count the mediators contacted, mark the records that
	 * are not for this set as errored, and find the highest commit
	 * count among the records that are left.
	 */
	for (node = head; node != NULL; node = node->mdl_nx) {
		struct timeval32 idtime;

		md = node->mdl_med;
		contacted++;

		/* Paranoid check */
		if (md->med_dat_sn != setno)
			md->med_dat_fl |= MED_DFL_ERROR;

		TIMEVAL_TO_TIMEVAL32(&idtime, &md->med_dat_id);

		/*CSTYLED*/
		if (timercmp(&idtime, &lbp->lb_ident.createtime, !=))
			md->med_dat_fl |= MED_DFL_ERROR;

		if (!(md->med_dat_fl & MED_DFL_ERROR) &&
		    md->med_dat_cc > topcc)
			topcc = md->med_dat_cc;
	}

	/*
	 * Second pass: mark the records that do not carry the highest
	 * commit count, or whose commit count differs from the locator
	 * block's, as errored; count the survivors and the golden copies.
	 */
	for (node = head; node != NULL; node = node->mdl_nx) {
		md = node->mdl_med;
		if (md->med_dat_fl & MED_DFL_ERROR)
			continue;

		if (md->med_dat_cc != topcc ||
		    md->med_dat_cc != lbp->lb_commitcnt) {
			md->med_dat_fl |= MED_DFL_ERROR;
			continue;
		}

		if (md->med_dat_fl & MED_DFL_GOLDEN)
			golden++;

		valid++;
	}

	/* No survivors means stale; otherwise decide on quorum/golden */
	if (valid != 0) {
		if (contacted >= ((s->s_med.n_cnt / 2) + 1) || golden) {
			/* We either have a quorum or a golden copy, or both */
			rval = 0;
		} else if (!(s->s_med.n_cnt & 1) &&
		    contacted == (s->s_med.n_cnt / 2)) {
			/* Even count and exactly 50%, allow an accept */
			md_set_setstatus(setno, MD_SET_ACCOK);
		}
	}

	/* Release the mediator data list */
	for (node = head; node != NULL; node = next) {
		next = node->mdl_nx;
		kmem_free(node->mdl_med, sizeof (med_data_t));
		kmem_free(node, sizeof (med_data_lst_t));
	}

	return (rval);
}

/*
 *	1. read masterblks and locator blocks for all known database locations
 *		a. keep track of which have good master blks
 *		b. keep track of which have good locators
 *
 */
static int
get_mbs_n_lbs(
	mddb_set_t	*s,
	int		*write_lb
)
{
	mddb_lb_t	*lbp = NULL;		/* pointer to locator block */
						/* May be cast to mddb_mnlb_t */
						/* if accessing sidenames in */
						/* MN set */
	mddb_did_ic_t	*did_icp = NULL;	/* ptr to Device ID incore */
	mddb_did_blk_t	*did_blkp = 0;
	int		did_blkp_sz = 0;
	mddb_did_db_t	*did_dbp;
	mddb_did_info_t	*did_info;
	caddr_t		did_block;
	mddb_ri_t	*rip;
	mddb_dtag_lst_t	*dtlp;
	mddb_locator_t	*lp;
	daddr_t		physblk;
	int		li;
	uint_t		blk;
	md_dev64_t	dev;
	caddr_t		buffer;
	uint_t		lb_blkcnt;
	int		retval = 0;
	int		err = 0;
	int		lb_ok = 0;
	int		lb_total = 0;
	int		lb_tagged = 0;
	int		lb_tags;
	set_t		setno = s->s_setno;
	int		cont_flag, i;
	mddb_did_db_t	*did_dbp1, *did_dbp2;
	int		mn_set = 0;
	mddb_cfg_loc_t	*cl;

	/*
	 * read in master blocks and locator block for all known locators.
	 * lb_blkcnt will be set correctly for MN set later once getmasters
	 * has determined that the set is a MN set.
	 */
	lb_blkcnt = ((setno == MD_LOCAL_SET) ? MDDB_LOCAL_LBCNT : MDDB_LBCNT);

	for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) {
		rip->ri_flags &= (MDDB_F_PTCHED | MDDB_F_IOCTL |
		    MDDB_F_EMASTER);
		rip->ri_lbp = (mddb_lb_t *)NULL;
		rip->ri_did_icp = (mddb_did_ic_t *)NULL;

		/*
		 * Translated dev is only used in calls to getmasters and
		 * getblks which expect a translated (aka miniroot) dev.
		 */
		dev = md_xlate_targ_2_mini(rip->ri_dev);
		if (dev == NODEV64) {
			/* Set error flag that getmasters would have set */
			/* if getmasters had been allowed to fail */
			rip->ri_flags |= MDDB_F_EMASTER;
		}

		/*
		 * Invalid device id on system (due to failed or
		 * removed device) or invalid devt during upgrade
		 * (due to powered off device) will cause this
		 * replica to be marked in error and not used.
		 */
		if (rip->ri_flags & MDDB_F_EMASTER)
			continue;

		/* get all master blocks, does mddb_devopen() */
		rip->ri_mbip = getmasters(s, dev, rip->ri_blkno,
		    &rip->ri_flags, &mn_set);

		/* if invalid master block - try next replica */
		if (! rip->ri_mbip)
			continue;

		/*
		 * If lbp alloc'd to wrong size - reset it.
		 * If MN set, lb_blkcnt must be MDDB_MNLBCNT.
		 * If a traditional set, lb_blkcnt must NOT be MDDB_MNLBCNT.
		 */
		if (lbp) {
			if (((mn_set) && (lb_blkcnt != MDDB_MNLBCNT)) ||
			    ((!mn_set) && (lb_blkcnt == MDDB_MNLBCNT))) {
				kmem_free((caddr_t)lbp, dbtob(lb_blkcnt));
				lbp = (mddb_lb_t *)NULL;
			}
		}

		if (lbp == (mddb_lb_t *)NULL) {
			/* If a MN set, set lb_blkcnt for MN loc blk size */
			if (mn_set)
				lb_blkcnt = MDDB_MNLBCNT;
			lbp = (mddb_lb_t *)kmem_zalloc(dbtob(lb_blkcnt),
			    KM_SLEEP);
		}

		/*
		 * Read in all the sectors for the locator block
		 * NOTE: Need to use getblks, rather than readblklst.
		 *	because it is too early and things are
		 *	NOT set up yet for read*()'s
		 */
		buffer = (caddr_t)lbp;
		for (blk = 0; blk < lb_blkcnt; blk++) {
			physblk = getphysblk(blk, rip->ri_mbip);
			err = getblks(s, buffer, dev, physblk,
			    btodb(MDDB_BSIZE), 0);
			if (err) {
				rip->ri_flags |= err;
				break;
			}
			buffer += MDDB_BSIZE;
		}

		if (err)
			continue;

		/* Verify the locator block */
		if (blk != lb_blkcnt)
			continue;
		if (lbp->lb_magic != MDDB_MAGIC_LB)
			continue;
		if (lbp->lb_blkcnt != lb_blkcnt)
			continue;
		if (mn_set) {
			/* If a MN set, check for MNLB revision in lb. */
			if (revchk(MDDB_REV_MNLB, lbp->lb_revision))
				continue;
		} else {
			/* If not a MN set, check for LB revision in lb. */
			if (revchk(MDDB_REV_LB, lbp->lb_revision))
				continue;
		}
		if (crcchk(lbp, &lbp->lb_checksum, dbtob(lb_blkcnt), NULL))
			continue;

		/*
		 * With the addition of MultiNode Disksets, we must make sure
		 * to verify that this is the correct set.  A node could
		 * have been out of the config for awhile and this disk could
		 * have been moved to a different diskset and we don't want
		 * to accidentally start the wrong set.
		 *
		 * We don't do this check if we're in the middle of
		 * importing a set.
		 */
		if (!(md_get_setstatus(s->s_setno) &
		    (MD_SET_IMPORT | MD_SET_REPLICATED_IMPORT)) &&
		    (lbp->lb_setno != s->s_setno))
			continue;

		rip->ri_flags |= MDDB_F_LOCACC;

		/*
		 * a commit count of zero means this locator has been deleted
		 */
		if (lbp->lb_commitcnt == 0)
			continue;

		/*
		 * If replica is in the device ID style and md_devid_destroy
		 * flag is set, turn off device id style.  This is only to be
		 * used in a catastrophic failure case.  Examples would be
		 * where the device id of all drives in the system
		 * (especially the mirror'd root drives) had been changed
		 * by firmware upgrade or by a patch to an existing disk
		 * driver.  Another example would be in the case of non-unique
		 * device ids due to a bug.  The device id would be valid on
		 * the system, but would return the wrong dev_t.
		 */
		if ((lbp->lb_flags & MDDB_DEVID_STYLE) && md_devid_destroy) {
			lbp->lb_flags &= ~MDDB_DEVID_STYLE;
			lbp->lb_didfirstblk = 0;
			lbp->lb_didblkcnt = 0;
			*write_lb = 1;
		}


		/*
		 * If replica is in device ID style, read in device ID
		 * block and verify device ID block information.
		 */
		if (lbp->lb_flags & MDDB_DEVID_STYLE) {

			/* Read in device ID block */
			if (did_icp == NULL) {
				did_icp = (mddb_did_ic_t *)
				    kmem_zalloc(sizeof (mddb_did_ic_t),
				    KM_SLEEP);
			} else {
				/* Reuse did_icp, but clear out data */
				if (did_icp->did_ic_blkp !=
				    (mddb_did_blk_t *)NULL) {
					kmem_free((caddr_t)did_icp->did_ic_blkp,
					    did_blkp_sz);
					did_blkp = (mddb_did_blk_t *)NULL;
					did_icp->did_ic_blkp =
					    (mddb_did_blk_t *)NULL;
				}
				if (did_icp->did_ic_dbp !=
				    (mddb_did_db_t *)NULL) {
					did_dbp1 = did_icp->did_ic_dbp;
					while (did_dbp1) {
						did_dbp2 = did_dbp1->db_next;
						kmem_free((caddr_t)
						    did_dbp1->db_ptr,
						    dbtob(did_dbp1->db_blkcnt));
						kmem_free((caddr_t)did_dbp1,
						    sizeof (mddb_did_db_t));
						did_dbp1 = did_dbp2;
					}
					did_icp->did_ic_dbp =
					    (mddb_did_db_t *)NULL;
				}
				for (i = 0; i < MDDB_NLB; i++) {
					did_icp->did_ic_devid[i] =
					    (ddi_devid_t)NULL;
				}
			}

			/* Can't reuse blkp since size could be different */
			if (did_blkp != (mddb_did_blk_t *)NULL) {
				kmem_free(did_blkp, did_blkp_sz);
			}
			did_blkp_sz = (int)dbtob(lbp->lb_didblkcnt);
			did_blkp = (mddb_did_blk_t *)kmem_zalloc(did_blkp_sz,
			    KM_SLEEP);
			did_icp->did_ic_blkp = did_blkp;
			buffer = (caddr_t)did_blkp;
			for (blk = lbp->lb_didfirstblk;
			    blk < (lbp->lb_didblkcnt + lbp->lb_didfirstblk);
			    blk++) {
				physblk = getphysblk(blk, rip->ri_mbip);
				err = getblks(s, buffer, dev, physblk,
				    btodb(MDDB_BSIZE), 0);
				if (err) {
					rip->ri_flags |= err;
					break;
				}
				buffer += MDDB_BSIZE;
			}
			if (err)
				continue;

			/* Verify the Device ID block */
			if (blk != (lbp->lb_didblkcnt + lbp->lb_didfirstblk))
				continue;
			if (did_blkp->blk_magic != MDDB_MAGIC_DI)
				continue;
			if (lbp->lb_didblkcnt != MDDB_DID_BLOCKS)
				continue;
			if (revchk(MDDB_REV_DI, did_blkp->blk_revision))
				continue;
			if (crcchk(did_blkp, &did_blkp->blk_checksum,
			    dbtob(lbp->lb_didblkcnt), NULL))
				continue;

			/*
			 * Check if device ID block is out of sync with the
			 * Locator Block by checking if the locator block
			 * commitcnt does not match the device id block
			 * commitcnt.  If an 'out of sync' condition
			 * exists, discard this replica since it has
			 * inconsistent data and can't be used in
			 * determining the best replica.
			 *
			 * An 'out of sync' condition could happen if old
			 * SDS code was running with new devid style replicas
			 * or if a failure occurred between the writing of
			 * the locator block's commitcnt and the device
			 * id block's commitcnt.
			 *
			 * If old SDS code had been running, the upgrade
			 * process should detect this situation and
			 * have removed all of the device id information
			 * via the md_devid_destroy flag in md.conf.
			 */
			if (did_blkp->blk_commitcnt !=
			    lbp->lb_commitcnt) {
				continue;
			}
		}


		/*
		 * If replica is still in device ID style, read in all
		 * of the device IDs, verify the checksum of the device IDs.
		 */
		if (lbp->lb_flags & MDDB_DEVID_STYLE) {
			/*
			 * Reset valid bit in device id info block flags. This
			 * flag is stored on disk, but the valid bit is reset
			 * when reading in the replica.  If the corresponding
			 * device id is valid (aka meaning that the system
			 * knows about this device id), the valid bit will
			 * be set at a later time.  The valid bit for this
			 * replica's device ID will be set in this routine.
			 * The valid bits for the rest of the device id's
			 * will be set after the 'best' replica has
			 * been selected in routine load_old_replicas.
			 * Reset updated bit in device id info block flags.
			 * This flag is also stored on disk, reset when read
			 * in and set when the locators and side locators
			 * have been updated to match this valid device
			 * id information.
			 */
			for (li = 0; li < lbp->lb_loccnt; li++) {
				did_info = &did_blkp->blk_info[li];
				if (did_info->info_flags & MDDB_DID_EXISTS)
					did_info->info_flags &=
					    ~(MDDB_DID_VALID |
					    MDDB_DID_UPDATED);
			}

			cont_flag = 0;
			for (li = 0; li < lbp->lb_loccnt; li++) {
				did_info = &did_blkp->blk_info[li];
				did_block = (caddr_t)NULL;
				if (did_info->info_flags & MDDB_DID_EXISTS) {
					/*
					 * Check if block has
					 * already been read in
					 */
					did_dbp = did_icp->did_ic_dbp;
					while (did_dbp != 0) {
						if (did_dbp->db_firstblk ==
						    did_info->info_firstblk)
							break;
						else
							did_dbp =
							    did_dbp->db_next;
					}
					/* if block not found, read it in */
					if (did_dbp == NULL) {
						did_block = (caddr_t)
						    (kmem_zalloc(dbtob(
						    did_info->info_blkcnt),
						    KM_SLEEP));
						buffer = (caddr_t)did_block;
						for (blk =
						    did_info->info_firstblk;
						    blk < (did_info->
						    info_firstblk +
						    did_info->info_blkcnt);
						    blk++) {
							physblk =
							    getphysblk(blk,
							    rip->ri_mbip);
							err = getblks(s,
							    buffer, dev,
							    physblk, btodb(
							    MDDB_BSIZE), 0);
							if (err) {
								rip->ri_flags |=
								    err;
								break;
							}
							buffer += MDDB_BSIZE;
						}
						if (err) {
							kmem_free(did_block,
							    dbtob(did_info->
							    info_blkcnt));
							did_block =
							    (caddr_t)NULL;
							cont_flag = 1;
							break;
						}

						/*
						 * Block read in -
						 * alloc Disk Block area
						 */
						did_dbp = (mddb_did_db_t *)
						    kmem_zalloc(
						    sizeof (mddb_did_db_t),
						    KM_SLEEP);
						did_dbp->db_ptr = did_block;
						did_dbp->db_firstblk =
						    did_info->info_firstblk;
						did_dbp->db_blkcnt =
						    did_info->info_blkcnt;

						/* Add to front of dbp list */
						did_dbp->db_next =
						    did_icp->did_ic_dbp;
						did_icp->did_ic_dbp = did_dbp;
					}
					/* Check validity of devid in block */
					if (crcchk(((char *)did_dbp->db_ptr +
					    did_info->info_offset),
					    &did_info->info_checksum,
					    did_info->info_length, NULL)) {
						cont_flag = 1;
						break;
					}

					/* Block now pointed to by did_dbp */
					did_icp->did_ic_devid[li] =
					    (ddi_devid_t)((char *)
					    did_dbp->db_ptr +
					    did_info->info_offset);
				}
			}
			if (cont_flag)
				continue;
		}

		/*
		 * All blocks containing devids are now in core.
		 */

		/*
		 * If we're doing a replicated import (also known as
		 * remote copy import), the device id in the locator
		 * block is incorrect and we need to fix it up here
		 * along with the l_dev, otherwise we run into lots of
		 * trouble later on.
		 */
		if ((md_get_setstatus(setno) & MD_SET_REPLICATED_IMPORT)) {
			mddb_ri_t	*trip;
			for (li = 0; li < lbp->lb_loccnt; li++) {
				did_info = &did_blkp->blk_info[li];
				lp = &lbp->lb_locators[li];

				if (lp->l_flags & MDDB_F_DELETED)
					continue;

				if (!(did_info->info_flags & MDDB_DID_EXISTS))
					continue;

				if (did_icp->did_ic_devid[li] == NULL)
					continue;

				for (trip = s->s_rip; trip != NULL;
				    trip = trip->ri_next) {
					if (trip->ri_old_devid == NULL)
						continue;
					if (ddi_devid_compare(
					    trip->ri_old_devid,
					    did_icp->did_ic_devid[li]) != 0) {
						continue;
					}

					/* update l_dev and side mnum */
					lp->l_dev = md_cmpldev(trip->ri_dev);
					lbp->lb_sidelocators[0][li].l_mnum =
					    md_getminor(trip->ri_dev);
				}
			}
		}

		/*
		 * If there is a valid devid, verify that this locator
		 * block has information about itself by checking the
		 * device ID, minor_name and block
		 * number from this replica's incore data structure
		 * against the locator block information that has just
		 * been read in from disk.
		 *
		 * If not a valid devid, verify that this locator block
		 * has information about itself by checking the minor
		 * number, block number and driver name from this
		 * replica's incore data structure against the locator
		 * block information that has just been read in from disk.
		 */
		if ((rip->ri_devid != NULL) &&
		    (lbp->lb_flags & MDDB_DEVID_STYLE)) {
			/*
			 * This locator block MUST have locator (replica)
			 * information about itself.  Check against devid,
			 * slice part of minor number, and block number.
			 */
			for (li = 0; li < lbp->lb_loccnt; li++) {
				did_info = &did_blkp->blk_info[li];
				lp = &lbp->lb_locators[li];
				if (lp->l_flags & MDDB_F_DELETED)
					continue;

				if (!(did_info->info_flags & MDDB_DID_EXISTS))
					continue;

				if (((md_get_setstatus(setno) &
				    MD_SET_REPLICATED_IMPORT)) &&
				    (rip->ri_old_devid != (ddi_devid_t)NULL)) {
					if (ddi_devid_compare(rip->ri_old_devid,
					    did_icp->did_ic_devid[li]) != 0)
						continue;
				} else {
					if (ddi_devid_compare(rip->ri_devid,
					    did_icp->did_ic_devid[li]) != 0)
						continue;
				}

				if (strcmp(rip->ri_minor_name,
				    did_info->info_minor_name) != 0)
					continue;

				if (lp->l_blkno == rip->ri_blkno)
					break;
			}
		} else {
			/*
			 * This locator block MUST have locator (replica)
			 * information about itself.
			 */
			if (!mn_set) {
				for (li = 0; li < lbp->lb_loccnt; li++) {
					mddb_drvnm_t		*dn;
					mddb_sidelocator_t	*slp;

					lp = &lbp->lb_locators[li];
					slp = &lbp->
					    lb_sidelocators[s->s_sideno][li];
					if (lp->l_flags & MDDB_F_DELETED)
						continue;
					if (slp->l_mnum != md_getminor(
					    rip->ri_dev))
						continue;
					if (lp->l_blkno != rip->ri_blkno)
						continue;
					dn = &lbp->lb_drvnm[slp->l_drvnm_index];
					if (strncmp(dn->dn_data,
					    rip->ri_driver, MD_MAXDRVNM) == 0)
						break;
				}
			} else {
				for (li = 0; li < lbp->lb_loccnt; li++) {
					mddb_drvnm_t		*dn;
					mddb_mnsidelocator_t	*mnslp;
					mddb_mnlb_t		*mnlbp;
					int			i;

					/*
					 * Check all possible locators looking
					 * for match to the currently read-in
					 * locator, must match on:
					 *	- blkno
					 *	- side locator for this
					 *	  node's side
					 *	- side locator minor number
					 *	- side locator driver name
					 */

					/*
					 * Looking at sidelocs:
					 * cast lbp -> mnlbp
					 */
					mnlbp = (mddb_mnlb_t *)lbp;
					lp = &mnlbp->lb_locators[li];
					if (lp->l_flags & MDDB_F_DELETED)
						continue;
					if (lp->l_blkno != rip->ri_blkno)
						continue;

					for (i = 0; i < MD_MNMAXSIDES; i++) {
						mnslp = &mnlbp->
						    lb_mnsidelocators[i][li];
						if (mnslp->mnl_sideno ==
						    s->s_sideno) {
							break;
						}
					}
					/* No matching side found */
					if (i == MD_MNMAXSIDES)
						continue;
					if (mnslp->mnl_mnum !=
					    md_getminor(rip->ri_dev))
						continue;
					dn = &lbp->
					    lb_drvnm[mnslp->mnl_drvnm_index];
					if (strncmp(dn->dn_data,
					    rip->ri_driver, MD_MAXDRVNM) == 0)
						break;
				}
			}
		}

		/*
		 * If we didn't find ourselves in this locator block, it
		 * means the locator block is a stale transplant, probably
		 * from a user doing a dd.
		 */
		if (li == lbp->lb_loccnt)
			continue;

		/*
		 * Keep track of the number of accessed and valid
		 * locator blocks.
		 */
		lb_ok++;

		/*
		 * Read the tag in, skips invalid or blank tags.
		 * Only valid tags allocate storage
		 * Data tags are not used in MN disksets.
		 */
		if ((!mn_set) && (! dt_read(s, lbp, rip))) {
			/*
			 * Keep track of the number of tagged
			 * locator blocks.
			 */
			lb_tagged++;

			/* Keep a list of unique tags. */
			(void) dtl_addl(s, &rip->ri_dtp->dt_dtag);
		}

		if (!(md_get_setstatus(setno) & MD_SET_REPLICATED_IMPORT)) {
			/*
			 * go through locator block and add any other
			 * locations of the data base.
			 * For the replicated import case, this was done earlier
			 * and we really don't need or want to do so again
			 */
			cl = kmem_zalloc(sizeof (mddb_cfg_loc_t), KM_SLEEP);
			for (li = 0; li < lbp->lb_loccnt; li++) {
				lp = &lbp->lb_locators[li];
				if (lp->l_flags & MDDB_F_DELETED)
					continue;

				cl->l_devid_flags = MDDB_DEVID_GETSZ;
				cl->l_devid = (uint64_t)0;
				cl->l_devid_sz = 0;
				cl->l_old_devid = (uint64_t)0;
				cl->l_old_devid_sz = 0;
				cl->l_minor_name[0] = '\0';
				locator2cfgloc(lbp, cl, li, s->s_sideno,
				    did_icp);

				if (cl->l_devid_flags & MDDB_DEVID_SZ) {
					if ((cl->l_devid = (uintptr_t)kmem_alloc
					    (cl->l_devid_sz, KM_SLEEP))
					    == NULL) {
						continue;
					} else {
						cl->l_devid_flags =
						    MDDB_DEVID_SPACE;
					}
				}
				locator2cfgloc(lbp, cl, li, s->s_sideno,
				    did_icp);

				(void) ridev(&s->s_rip, cl, &lp->l_dev, 0);

				if (cl->l_devid_flags & MDDB_DEVID_SPACE)
					kmem_free((caddr_t)(uintptr_t)
					    cl->l_devid, cl->l_devid_sz);
			}
			kmem_free(cl, sizeof (mddb_cfg_loc_t));
		}

		/* Save LB for later */
		rip->ri_lbp = lbp;
		if (lbp->lb_flags & MDDB_DEVID_STYLE) {
			rip->ri_did_icp = did_icp;
			did_icp = (mddb_did_ic_t *)NULL;
			did_blkp = (mddb_did_blk_t *)NULL;
		} else
			rip->ri_did_icp = NULL;
		lbp = (mddb_lb_t *)NULL;
	}

	if (lbp != (mddb_lb_t *)NULL)
		kmem_free((caddr_t)lbp, dbtob(lb_blkcnt));

	if (did_icp != (mddb_did_ic_t *)NULL) {
		if (did_icp->did_ic_blkp != (mddb_did_blk_t *)NULL) {
			kmem_free((caddr_t)did_icp->did_ic_blkp, did_blkp_sz);
			did_blkp = (mddb_did_blk_t *)NULL;
		}
		if (did_icp->did_ic_dbp != (mddb_did_db_t *)NULL) {
			mddb_did_db_t	*did_dbp1, *did_dbp2;

			did_dbp1 = did_icp->did_ic_dbp;
			while (did_dbp1) {
				did_dbp2 = did_dbp1->db_next;
				kmem_free((caddr_t)did_dbp1->db_ptr,
				    dbtob(did_dbp1->db_blkcnt));
				kmem_free((caddr_t)did_dbp1,
				    sizeof (mddb_did_db_t));
				did_dbp1 = did_dbp2;
			}
		}
		kmem_free((caddr_t)did_icp, sizeof (mddb_did_ic_t));
	}

	if (did_blkp != (mddb_did_blk_t *)NULL) {
		kmem_free((caddr_t)did_blkp, did_blkp_sz);
	}

	/* No locator blocks were ok */
	if (lb_ok == 0)
		goto out;

	/* No tagged data was found - will be 0 for MN diskset */
	if (lb_tagged == 0)
		goto out;

	/* Find the highest non-deleted replica count */
	for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) {
		int		lb_tot = 0;

		if (rip->ri_mbip == (mddb_mb_ic_t *)NULL)
			continue;

		if (rip->ri_lbp == (mddb_lb_t *)NULL)
			continue;

		for (li = 0; li < rip->ri_lbp->lb_loccnt; li++) {
			lp = &rip->ri_lbp->lb_locators[li];
			if (lp->l_flags & MDDB_F_DELETED)
				continue;
			lb_tot++;
		}

		if (lb_tot > lb_total)
			lb_total = lb_tot;
	}

	/* Count the number of unique tags */
	for (lb_tags = 0, dtlp = s->s_dtlp; dtlp != NULL; dtlp = dtlp->dtl_nx)
		lb_tags++;

	/* Should have at least one tag at this point */
	ASSERT(lb_tags > 0);


	/*
	 * If the number of tagged locators is not the same as the number of
	 * OK locators OR more than one tag exists, then make sure the
	 * selected tag will be written out later.
	 */
	if ((lb_tagged - lb_ok) != 0 || lb_tags > 1)
		md_set_setstatus(setno, MD_SET_TAGDATA);

	/* Only a single tag, take the tagged data */
	if (lb_tags == 1) {
		dt_setup(s, &s->s_dtlp->dtl_dt);
		md_set_setstatus(setno, MD_SET_USETAG);
		goto out;
	}

	/* Multiple tags, not selecting a tag, tag mode is on */
	if (! (md_get_setstatus(setno) & MD_SET_USETAG))
		retval = MDDB_E_TAGDATA;

out:

	return (retval);
}

/*
 * Load an existing set of replicas into core.
 *
 *	1. Select a locator.
 *	2. check if enough locators now have current copies
 *	3. read in database from one of latest
 *	4. if known to have latest make all database the same
 *	5. if configuration has changed rewrite locators
 *
 * Parameters:
 * 	s - pointer to mddb_set structure
 *	flag - used in MN disksets to tell if this node is being joined to
 *		a diskset that is in the STALE state.  If the flag is
 *		MDDB_MN_STALE, then this node should be marked in the STALE
 *		state even if > 50% mddbs are available.  (The diskset can
 *		only change from STALE->OK if all nodes withdraw from the
 *		MN diskset and then rejoin).
 *
 * Returns:
 *	0, or one of the MDDB_E_* codes assigned to retval below:
 *	whatever get_mbs_n_lbs() returned (MDDB_E_TAGDATA),
 *	MDDB_E_NOLOCBLK, MDDB_E_NODEVID, MDDB_E_NOSPACE, MDDB_E_NOLOCNMS
 *	or MDDB_E_NODIRBLK.
 *
 *	NOTE(review): three paths jump to errout without assigning retval,
 *	so the function returns 0 for them: a non-zero selectreplicas()
 *	result outside of a replicated import, a failing
 *	update_locatorblock(), and a failing push_lb().  Confirm that the
 *	callers pick up these conditions from the set status instead.
 *
 * Whatever the outcome, the errout section at the bottom releases the
 * per-replica (rip) resources that are no longer needed.
 */
static int
load_old_replicas(
	mddb_set_t	*s,
	int		flag
)
{
	mddb_lb_t	*lbp = NULL;
	mddb_mnlb_t	*mnlbp = NULL;
	mddb_ri_t	*rip;
	mddb_locator_t	*lp;
	mddb_db_t	*dbp;
	mddb_de_ic_t	*dep;
	int		li;
	int		alc;
	int		lc;
	int		tlc;
	int		retval = 0;
	caddr_t		p;
	size_t		maxrecsize;
	set_t		setno = s->s_setno;
	mddb_did_db_t	*did_dbp1;
	mddb_did_info_t	*did_info;
	mddb_did_ic_t	*did_icp = NULL;
	md_dev64_t	*newdev;
	mddb_sidelocator_t	*slp = 0;
	mddb_mnsidelocator_t	*mnslp = 0;
	uchar_t		i;
	char		*name;
	ddi_devid_t	ret_devid;
	md_dev64_t	dev;
	uint_t		len, sz;
	char		*minor_name;
	int		write_lb = 0;
	int		rval;
	int		stale_rtn = 0;

	/* The only error path out of get_mbs_n_lbs() is MDDB_E_TAGDATA */
	if (retval = get_mbs_n_lbs(s, &write_lb))
		goto errout;

	/* Pick the locator block to work from and remember it in the set. */
	if ((lbp = s->s_lbp = selectlocator(s)) == NULL) {
		retval = MDDB_E_NOLOCBLK;
		goto errout;
	}

	/* If a multi-node set, then set md_set.s_status flag */
	if (lbp->lb_flags & MDDB_MNSET) {
		md_set_setstatus(setno, MD_SET_MNSET);
		/*
		 * If data tag area had been allocated before set type was
		 * known - free it now.
		 */
		if (md_set[setno].s_dtp) {
			kmem_free((caddr_t)md_set[setno].s_dtp, MDDB_DT_BYTES);
			md_set[setno].s_dtp = NULL;
		}
	}

	/*
	 * If the replica is in devid format, setup the devid incore ptr.
	 */
	if (lbp->lb_flags & MDDB_DEVID_STYLE) {
		/* Find the rip that owns the selected locator block. */
		for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) {
			if (rip->ri_lbp == s->s_lbp) {
				did_icp = s->s_did_icp = rip->ri_did_icp;
				break;
			}
		}
		/*
		 * If no devid incore info found - something has gone
		 * wrong so errout.
		 */
		if (rip == NULL) {
			retval = MDDB_E_NODEVID;
			goto errout;
		}

		/*
		 * Add all blocks containing devids to free list.
		 * Then remove addresses that actually contain devids.
		 */
		did_dbp1 = did_icp->did_ic_dbp;
		while (did_dbp1) {
			if (mddb_devid_free_add(s, did_dbp1->db_firstblk,
			    0, dbtob(did_dbp1->db_blkcnt))) {
				retval = MDDB_E_NOSPACE;
				goto errout;
			}

			did_dbp1 = did_dbp1->db_next;
		}
		for (li = 0; li < lbp->lb_loccnt; li++) {
			did_info = &(did_icp->did_ic_blkp->blk_info[li]);
			if (!(did_info->info_flags & MDDB_DID_EXISTS))
				continue;

			if (mddb_devid_free_delete(s, did_info->info_firstblk,
			    did_info->info_offset, did_info->info_length)) {
				/* unable to find disk block */
				retval = MDDB_E_NODEVID;
				goto errout;
			}
		}
	}

	/*
	 * Fill in s_mbiarray, count all locators (lc) and active
	 * locators (alc).
	 */
	alc = 0;
	lc = 0;
	for (li = 0; li < lbp->lb_loccnt; li++) {
		ddi_devid_t	li_devid;

		lp = &lbp->lb_locators[li];

		if (lp->l_flags & MDDB_F_DELETED)
			continue;

		/* Count non-deleted replicas */
		lc++;

		/*
		 * Use the devid of this locator to compare with the rip
		 * list.  The scenario to watch out for here is that this
		 * locator could be on a disk that is dead and there could
		 * be a valid entry in the rip list for a different disk
		 * that has been moved to the dead disks dev_t.  We don't
		 * want to match with the moved disk.
		 */
		li_devid = NULL;
		(void) mddb_devid_get(s, li, &li_devid, &minor_name);

		for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) {
			if (match_mddb(rip, li_devid, minor_name,
			    md_expldev(lp->l_dev), lp->l_blkno)) {
				break;
			}
		}
		if (rip == NULL) {
			/*
			 * If rip not found, then mark error in master block
			 * so that no writes are later attempted to this
			 * replica.  rip may not be setup if ridev
			 * failed due to un-found driver name.
			 */
			lp->l_flags |= MDDB_F_EMASTER;
			continue;
		}

		s->s_mbiarray[li] = rip->ri_mbip;

		/*
		 * Keep only the ACTIVE bit of the on-disk locator flags and
		 * merge in the flags recorded on the matching rip.
		 */
		lp->l_flags &= MDDB_F_ACTIVE;
		lp->l_flags |= (int)rip->ri_flags;

		/* A transplanted replica is never treated as active. */
		if (rip->ri_transplant)
			lp->l_flags &= ~MDDB_F_ACTIVE;

		if (lp->l_flags & MDDB_F_LOCACC)
			alc++;
	}

	/*
	 * Save on a divide - calculate half of the non-deleted replicas,
	 * rounded up, up front.
	 */
	tlc = ((lc + 1) / 2);

	if (alc > tlc) {		/* alc > tlc		- OK */
		md_clr_setstatus(setno, MD_SET_STALE);
	} else if (alc < tlc) {		/* alc < tlc		- stale */
		md_set_setstatus(setno, MD_SET_STALE);
	} else if (lc & 1) {		/* alc == tlc && odd	- OK */
		md_clr_setstatus(setno, MD_SET_STALE);
	} else {			/* alc == tlc && even	- ? */
		/* Can do an accept, and are */
		if (md_get_setstatus(setno) & (MD_SET_ACCOK | MD_SET_ACCEPT)) {
			md_clr_setstatus(setno, MD_SET_STALE);
		} else {		/* possibly has a mediator */
			/* A non-zero return from mediate() means stale. */
			if (mediate(s)) {
				md_set_setstatus(setno, MD_SET_STALE);
			} else {
				md_clr_setstatus(setno, MD_SET_STALE);
			}
		}

		/*
		 * The mirrored_root_flag allows the sysadmin to decide to
		 * start the local set in a read/write (non-stale) mode
		 * when there are only 50% available mddbs on the system and
		 * when the root file system is on a mirror.  This is useful
		 * in a 2 disk system where 1 disk failure would cause an mddb
		 * quorum failure and subsequent boot failures since the root
		 * filesystem would be in a read-only state.
		 */
		if (mirrored_root_flag == 1 && setno == 0 &&
		    svm_bootpath[0] != 0) {
			md_clr_setstatus(setno, MD_SET_STALE);
		} else {
			if (md_get_setstatus(setno) & MD_SET_STALE) {
				/* Allow half mode - CAREFUL! */
				if (mddb_allow_half)
					md_clr_setstatus(setno, MD_SET_STALE);
			}
		}

		/*
		 * In a MN diskset,
		 *	- if 50% mddbs are unavailable and this
		 *		has been marked STALE above
		 * 	- master node isn't in the STALE state
		 *	- this node isn't the master node (this node
		 *		isn't the first node to join the set)
		 * then clear the STALE state and set TOOFEW.
		 *
		 * If this node is the master node and set was marked STALE,
		 * then the set stays STALE.
		 *
		 * If this node is not the master and this node's state is
		 * STALE and the master node is not marked STALE,
		 * then master node must be in the TOOFEW state or the
		 * master is panic'ing.  A MN diskset can only be placed into
		 * the STALE state by having the first node join the set
		 * with <= 50% mddbs.  There's no way for a MN diskset to
		 * transition between STALE and not-STALE states unless all
		 * nodes are withdrawn from the diskset or all nodes in the
		 * diskset are rebooted at the same time.
		 *
		 * So, mark this node's state as TOOFEW instead of STALE.
		 */
		if (((md_get_setstatus(setno) & (MD_SET_MNSET | MD_SET_STALE))
		    == (MD_SET_MNSET | MD_SET_STALE)) &&
		    ((flag & MDDB_MN_STALE) == 0) &&
		    (!(md_set[setno].s_am_i_master))) {
			md_clr_setstatus(setno, MD_SET_STALE);
			md_set_setstatus(setno, MD_SET_TOOFEW);
		}
	}

	/*
	 * If a MN set is marked STALE on the other nodes,
	 * mark it stale here.  Override all other considerations
	 * such as a mediator or > 50% mddbs available.
	 */
	if (md_get_setstatus(setno) & MD_SET_MNSET) {
		if (flag & MDDB_MN_STALE)
			md_set_setstatus(setno, MD_SET_STALE);
	}

	/*
	 * read a good copy of the locator names
	 * if an error occurs reading what is supposed
	 * to be a good copy continue looking for another
	 * good copy
	 */
	s->s_lnp = NULL;
	for (li = 0; li < lbp->lb_loccnt; li++) {
		lp = &lbp->lb_locators[li];
		if ((! (lp->l_flags & MDDB_F_ACTIVE)) ||
		    (lp->l_flags & MDDB_F_EMASTER))
			continue;

		/* Find rip entry for this locator if one exists */
		for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) {
			if (match_mddb(rip, NULL, NULL, md_expldev(lp->l_dev),
			    lp->l_blkno))
				break;
		}

		if (rip == NULL) {
			continue;
		}

		/*
		 * Use the rip commitcnt since the commitcnt in lbp could
		 * have been cleared by selectlocator.  Looking for a replica
		 * with the same commitcnt as the 'golden' copy in order to
		 * get the same data.
		 */
		if (rip->ri_commitcnt != lbp->lb_commitcnt) {
			continue;
		}

		/*
		 * Now have a copy of the database that is equivalent
		 * to the chosen locator block with respect to
		 * inittime, identifier and commitcnt.   Trying the
		 * equivalent databases in the order that they were
		 * written will provide the most up to date data.
		 *
		 * The value returned by readlocnames() is folded into the
		 * locator flags; a non-NULL s_lnp afterwards ends the search.
		 */
		lp->l_flags |= readlocnames(s, li);
		if (s->s_lnp)
			break;
	}

	if (s->s_lnp == NULL) {
		retval = MDDB_E_NOLOCNMS;
		goto errout;
	}

	/*
	 * read a good copy of the data base
	 * if an error occurs reading what is supposed
	 * to be a good copy continue looking for another
	 * good copy
	 */

	s->s_dbp = NULL;
	for (li = 0; li < lbp->lb_loccnt; li++) {
		lp = &lbp->lb_locators[li];
		if ((! (lp->l_flags & MDDB_F_ACTIVE)) ||
		    (lp->l_flags & MDDB_F_EMASTER))
			continue;

		/* Find rip entry for this locator if one exists */
		for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) {
			if (match_mddb(rip, NULL, NULL, md_expldev(lp->l_dev),
			    lp->l_blkno))
				break;
		}

		if (rip == NULL) {
			continue;
		}

		/*
		 * Use the rip commitcnt since the commitcnt in lbp could
		 * have been cleared by selectlocator.  Looking for a replica
		 * with the same commitcnt as the 'golden' copy in order to
		 * get the same data.
		 */
		if (rip->ri_commitcnt != lbp->lb_commitcnt) {
			continue;
		}

		/*
		 * Now have a copy of the database that is equivalent
		 * to the chosen locator block with respect to
		 * inittime, identifier and commitcnt.   Trying the
		 * equivalent databases in the order that they were
		 * written will provide the most up to date data.
		 *
		 * The value returned by readcopy() is folded into the
		 * locator flags; a non-NULL s_dbp afterwards ends the search.
		 */
		lp->l_flags |= readcopy(s, li);

		if (s->s_dbp)
			break;
	}

	if (s->s_dbp == NULL) {
		retval = MDDB_E_NODIRBLK;
		goto errout;
	}

	/*
	 * lp still points at the locator the loop above stopped on, i.e.
	 * the one the database was read from.
	 */
	lp->l_flags |= MDDB_F_MASTER;
	lp->l_flags |= MDDB_F_UP2DATE;

	/*
	 * go through and find largest record;
	 * Also fixup the user data area's
	 */
	maxrecsize = MAX(MDDB_BSIZE, s->s_databuffer_size);

	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next)
		for (dep = dbp->db_firstentry; dep != NULL; dep = dep->de_next)
			if (dep->de_flags & MDDB_F_OPT)
				getoptrecord(s, dep);
			else {
				allocuserdata(dep);
				maxrecsize = MAX(dep->de_recsize, maxrecsize);
			}

	/* Grow the set's data buffer if a larger record was found. */
	if (maxrecsize > s->s_databuffer_size) {
		p = (caddr_t)kmem_zalloc(maxrecsize, KM_SLEEP);
		if (s->s_databuffer_size)
			kmem_free(s->s_databuffer, s->s_databuffer_size);
		s->s_databuffer = p;
		s->s_databuffer_size = maxrecsize;
	}

	/* If we can clear the tag data record, do it now. */
	/* Data tags not supported on MN sets */
	if ((md_get_setstatus(setno) & MD_SET_CLRTAG) &&
	    (!(md_get_setstatus(setno) & MD_SET_MNSET)))
		dt_setup(s, NULL);

	/* This will return non-zero if STALE or TOOFEW */
	/* This will write out chosen replica image to all replicas */
	stale_rtn = selectreplicas(s, MDDB_SCANALL);

	if ((md_get_setstatus(setno) & MD_SET_REPLICATED_IMPORT)) {
		ddi_devid_t	devidptr;

		/*
		 * ignore the return value from selectreplicas because we
		 * may have a STALE or TOOFEW set in the case of a partial
		 * replicated diskset. We will fix that up later.
		 */

		/*
		 * For every locator that has a devid, find the rips whose
		 * old devid matches it and update the locator block with
		 * the rip's current devid.
		 *
		 * NOTE(review): did_icp is only assigned in the
		 * MDDB_DEVID_STYLE branch above, so this assumes a
		 * replicated import always uses devid style replicas -
		 * confirm.
		 */
		lbp = s->s_lbp;
		for (li = 0; li < lbp->lb_loccnt; li++) {
			did_info = &(did_icp->did_ic_blkp->blk_info[li]);

			if (did_info->info_flags & MDDB_DID_EXISTS) {
				devidptr = s->s_did_icp->did_ic_devid[li];
				lp = &lbp->lb_locators[li];
				for (rip = s->s_rip; rip != NULL;
				    rip = rip->ri_next) {
					if (rip->ri_old_devid == 0)
						continue;
					if (ddi_devid_compare(rip->ri_old_devid,
					    devidptr) != 0) {
						continue;
					}
					/* retval is not set on this path */
					if (update_locatorblock(s,
					    md_expldev(lp->l_dev),
					    rip->ri_devid, rip->ri_old_devid)) {
						goto errout;
					}
				}
			}
		}
	} else {
		/* STALE or TOOFEW: stop here; retval is left unchanged. */
		if (stale_rtn)
			goto errout;
	}

	/*
	 * If the replica is in device id style - validate the device id's,
	 * if present, in the locator block devid area.
	 *
	 * newdev[li] receives the dev_t handed to mddb_devid_validate() for
	 * locator li; it is consulted by the fix-up pass further down.
	 */
	newdev = kmem_zalloc(sizeof (md_dev64_t) * MDDB_NLB, KM_SLEEP);
	if (lbp->lb_flags & MDDB_DEVID_STYLE) {
		for (li = 0; li < lbp->lb_loccnt; li++) {
			newdev[li] = 0;
			lp = &lbp->lb_locators[li];
			if (lp->l_flags & MDDB_F_DELETED)
				continue;
			did_info = &(did_icp->did_ic_blkp->blk_info[li]);
			dev = md_expldev(lp->l_dev);
			if (did_info->info_flags & MDDB_DID_EXISTS) {
				/* Validate device id on current system */
				newdev[li] = dev;
				if (mddb_devid_validate(
				    did_icp->did_ic_devid[li],
				    &(newdev[li]),
				    did_info->info_minor_name) == 0) {
					/* Set valid flag */
					did_info->info_flags |= MDDB_DID_VALID;
				} else {
					lp->l_flags |= MDDB_F_EMASTER;
				}
			} else if (!(MD_UPGRADE)) {
				/*
				 * If a device doesn't have a device id,
				 * check if there is now a device ID
				 * associated with device.  If one exists,
				 * add it to the locator block devid area.
				 * If there's not enough space to add it,
				 * print a warning.
				 * Don't do this during upgrade.
				 */
				dev_t ddi_dev = md_dev64_to_dev(dev);
				if (ddi_lyr_get_devid(ddi_dev, &ret_devid) ==
				    DDI_SUCCESS) {
					if (ddi_lyr_get_minor_name(ddi_dev,
					    S_IFBLK, &minor_name)
					    == DDI_SUCCESS) {
						if (mddb_devid_add(s, li,
						    ret_devid, minor_name)) {
							cmn_err(CE_WARN,
							    "Not enough space"
							    " in metadevice"
							    " state"
							    " database\n");
							cmn_err(CE_WARN,
							    "to add relocation"
							    " information for"
							    " device:\n");
							cmn_err(CE_WARN,
							    " major = %d, "
							    " minor = %d\n",
							    getmajor(ddi_dev),
							    getminor(ddi_dev));
						} else {
							/* devid area changed */
							write_lb = 1;
						}
						kmem_free(minor_name,
						    strlen(minor_name) + 1);
					}
					ddi_devid_free(ret_devid);
				}
			}
		}

		/*
		 * If a device has a valid device id and if the dev_t
		 * associated with the device id has changed, update the
		 * driver name, minor num and dev_t in the local and side
		 * locators to match the dev_t that the system currently
		 * associates with the device id.
		 *
		 * Don't do this during upgrade.
		 */
		if (!(MD_UPGRADE)) {
			for (li = 0; li < lbp->lb_loccnt; li++) {
				lp = &lbp->lb_locators[li];
				if (lp->l_flags & MDDB_F_DELETED)
					continue;
				did_info = &(did_icp->did_ic_blkp->blk_info
				    [li]);
				if ((did_info->info_flags & MDDB_DID_VALID) &&
				    !(did_info->info_flags &
				    MDDB_DID_UPDATED)) {
					if (lbp->lb_flags & MDDB_MNSET) {
						int j;
						int index = -1;
						mnlbp = (mddb_mnlb_t *)lbp;
						/*
						 * Look for this node's side
						 * locator; remember the last
						 * slot seen with sideno 0.
						 */
						for (j = 0; j < MD_MNMAXSIDES;
						    j++) {
							mnslp = &mnlbp->
							    lb_mnsidelocators[j]
							    [li];
							if (mnslp->mnl_sideno ==
							    s->s_sideno)
								break;
							if (mnslp->mnl_sideno ==
							    0)
								index = j;
						}
						if (j == MD_MNMAXSIDES) {
							/*
							 * No match found; take
							 * empty
							 *
							 * NOTE(review): index
							 * is still -1 if no
							 * slot had sideno 0;
							 * confirm that cannot
							 * happen here.
							 */
							mnslp = &mnlbp->
							    lb_mnsidelocators
							    [index][li];
							write_lb = 1;
							mnslp->mnl_mnum =
							    md_getminor(newdev
							    [li]);
						} else if (mnslp->mnl_mnum !=
						    md_getminor(newdev[li])) {
							write_lb = 1;
							mnslp->mnl_mnum =
							    md_getminor(newdev
							    [li]);
						}
					} else {
						slp = &lbp->
						    lb_sidelocators[s->s_sideno]
						    [li];
						if (slp->l_mnum !=
						    md_getminor(newdev[li])) {
							write_lb = 1;
							slp->l_mnum =
							    md_getminor(newdev
							    [li]);
						}
					}
					/*
					 * Compare the recorded driver name
					 * with the name of newdev's major.
					 */
					name = ddi_major_to_name(md_getmajor(
					    newdev[li]));
					if (lbp->lb_flags & MDDB_MNSET)
						i = mnslp->mnl_drvnm_index;
					else
						i = slp->l_drvnm_index;
					if (strncmp(lbp->lb_drvnm[i].dn_data,
					    name, lbp->lb_drvnm[i].dn_len) !=
					    0) {
						/* Driver name has changed */
						len = strlen(name);
						/* Look for the driver name */
						for (i = 0; i < MDDB_DRVNMCNT;
						    i++) {
							if (lbp->lb_drvnm[i].
							    dn_len != len)
								continue;
							if (strncmp(lbp->
							    lb_drvnm[i].dn_data,
							    name, len) == 0)
								break;
						}
						/* Didn't find one, add it */
						if (i == MDDB_DRVNMCNT) {
							/* First unused slot */
							for (i = 0; i <
							    MDDB_DRVNMCNT;
							    i++) {
								if (lbp->
								    lb_drvnm[i].
								    dn_len == 0)
									break;
							}
							/*
							 * Table full: warn and
							 * skip this locator
							 * without setting
							 * MDDB_DID_UPDATED.
							 */
							if (i ==
							    MDDB_DRVNMCNT) {
								cmn_err(CE_WARN,
								    "Unable to "
								    " update "
								    "driver "
								    " name for "
								    "dev:  "
								    "major = %d"
								    ", minor = "
								    "%d\n",
								    md_getmajor(
								    newdev[li]),
								    md_getminor(
								    newdev
								    [li]));
								continue;
							}
							(void) strncpy(lbp->
							    lb_drvnm[i].dn_data,
							    name, MD_MAXDRVNM);
							lbp->lb_drvnm[i].
							    dn_len = (uchar_t)
							    strlen(name);
						}
						/* Fill in the drvnm index */
						if (lbp->lb_flags &
						    MDDB_MNSET)
							mnslp->mnl_drvnm_index =
							    i;
						else
							slp->l_drvnm_index = i;
						write_lb = 1;
					}
					did_info->info_flags |=
					    MDDB_DID_UPDATED;
				}
			}
		}
	}
	kmem_free(newdev, sizeof (md_dev64_t) * MDDB_NLB);

	/*
	 * If locator block has been changed by get_mbs_n_lbs,
	 * by addition of new device id, by updated minor name or
	 * by updated driver name - write out locator block.
	 */
	if (write_lb) {
		rval = push_lb(s);
		(void) upd_med(s, "load_old_replicas(0)");
		/* retval is not set on this path */
		if (rval)
			goto errout;
	}

	/*
	 * If the tag was moved, allocated, or a BADTAG was seen for some other
	 * reason, then make sure tags are written to all the replicas.
	 * Data tags not supported on MN sets.
	 */
	if (!(md_get_setstatus(setno) & MD_SET_MNSET)) {
		/* lc is reused here as a "tags need rewriting" flag. */
		if (! (lc = dt_alloc_if_needed(s))) {
			for (li = 0; li < lbp->lb_loccnt; li++) {
				lp = &lbp->lb_locators[li];

				if ((! (lp->l_flags & MDDB_F_ACTIVE)) ||
				    (lp->l_flags & MDDB_F_EMASTER))
					continue;

				if (lp->l_flags & MDDB_F_BADTAG) {
					lc = 1;
					break;
				}
			}
		}

		if (lc) {
			md_set_setstatus(setno, MD_SET_TAGDATA);
			md_clr_setstatus(setno, MD_SET_BADTAG);
			(void) selectreplicas(s, MDDB_SCANALL);
		}
	}

errout:

	/*
	 * Free extraneous rip components.  This runs on the success path
	 * as well as on every error path; lbp is NULL here if no locator
	 * block was selected.
	 */
	for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) {
		/* Get rid of lbp's and dtp's */

		/* Only the rip holding the selected locator block keeps them */
		if (rip->ri_lbp != lbp) {
			if (rip->ri_dtp != (mddb_dt_t *)NULL) {
				kmem_free((caddr_t)rip->ri_dtp, MDDB_DT_BYTES);
				rip->ri_dtp = (mddb_dt_t *)NULL;
			}

			if (rip->ri_devid != (ddi_devid_t)NULL) {
				sz = (int)ddi_devid_sizeof(rip->ri_devid);
				kmem_free((caddr_t)rip->ri_devid, sz);
				rip->ri_devid = (ddi_devid_t)NULL;
			}
			if (rip->ri_old_devid != (ddi_devid_t)NULL) {
				sz = (int)ddi_devid_sizeof(rip->ri_old_devid);
				kmem_free((caddr_t)rip->ri_old_devid, sz);
				rip->ri_old_devid = (ddi_devid_t)NULL;
			}

			if (rip->ri_lbp != (mddb_lb_t *)NULL) {
				mddb_devid_icp_free(&rip->ri_did_icp,
				    rip->ri_lbp);

				kmem_free((caddr_t)rip->ri_lbp,
				    dbtob(rip->ri_lbp->lb_blkcnt));
				rip->ri_lbp = (mddb_lb_t *)NULL;
			}
		}

		/*
		 * A rip that corresponds to a non-deleted locator of the
		 * selected locator block keeps its master block; note that
		 * the continue also skips the EMASTER reset below.
		 */
		if (lbp != NULL) {
			for (li = 0; li < lbp->lb_loccnt; li++) {
				lp = &lbp->lb_locators[li];
				if (lp->l_flags & MDDB_F_DELETED)
					continue;
				if (rip->ri_dev == md_expldev(lp->l_dev) &&
				    rip->ri_blkno == lp->l_blkno)
					break;
			}
			if (li < lbp->lb_loccnt)
				continue;
		}

		/*
		 * Get rid of mbp's:
		 *	if lbp, those out of lb_loccnt bounds
		 *	if !lbp,  all of them.
		 */
		if (rip->ri_mbip) {
			md_dev64_t dev64 = md_xlate_targ_2_mini(rip->ri_dev);
			if (dev64 != NODEV64)
				mddb_devclose(dev64);

			free_mbipp(&rip->ri_mbip);
		}
		/*
		 * Turn off MDDB_F_EMASTER flag in a diskset since diskset
		 * code always ends up calling ridev for all replicas
		 * before calling load_old_replicas.  ridev will reset
		 * MDDB_F_EMASTER flag if flag was due to unresolved devid.
		 */
		if (setno != MD_LOCAL_SET)
			rip->ri_flags &= ~MDDB_F_EMASTER;
	}
	return (retval);
}

/*
 * Given the devt from the md.conf info, get the devid for the device.
 *
 * The dev_t is built from cl->l_driver and cl->l_mnum.  When both the
 * device id and the block minor name can be obtained, cl is updated with
 * the devid pointer, its size, the devid flags (SPACE | VALID | SZ) and
 * the minor name; ownership of the devid passes to cl.  On any failure a
 * CE_NOTE is logged and cl is left unmodified.
 */
static void
lookup_db_devid(mddb_cfg_loc_t *cl)
{
	major_t		major;
	dev_t		ldev;
	ddi_devid_t	devid;
	char		*minor;

	major = ddi_name_to_major(cl->l_driver);
	if (major == (major_t)-1) {
		cmn_err(CE_NOTE, "mddb: unknown major name '%s'", cl->l_driver);
		return;
	}

	ldev = makedevice(major, cl->l_mnum);
	if (ddi_lyr_get_devid(ldev, &devid) != DDI_SUCCESS) {
		cmn_err(CE_NOTE, "mddb: unable to get devid for '%s', 0x%x",
		    cl->l_driver, cl->l_mnum);
		return;
	}

	if (ddi_lyr_get_minor_name(ldev, S_IFBLK, &minor) != DDI_SUCCESS) {
		cmn_err(CE_NOTE, "mddb: unable to get minor name 0x%x",
		    cl->l_mnum);
		/* The devid is not handed to cl on this path; release it. */
		ddi_devid_free(devid);
		return;
	}

	cl->l_devid_flags = MDDB_DEVID_SPACE | MDDB_DEVID_VALID | MDDB_DEVID_SZ;
	cl->l_devid_sz = (int)ddi_devid_sizeof(devid);
	cl->l_devid = (uint64_t)(uintptr_t)devid;
	(void) strlcpy(cl->l_minor_name, minor, MDDB_MINOR_NAME_MAX);

	/* The minor name was copied into cl; free the DDI's copy. */
	kmem_free(minor, strlen(minor) + 1);
}

/*
 * grab driver name, minor, block and devid out of
 * strings like "driver:minor:block:devid"
 *
 * Parameters:
 *	str - NUL terminated description of a single replica location
 *	clp - location structure to fill in
 *
 * Returns 0 when the string was parsed (with or without a device id)
 * and -1 when a ':' separator is missing or the device id cannot be
 * decoded.
 *
 * When a device id is stored in clp->l_devid, MDDB_DEVID_SPACE is set in
 * clp->l_devid_flags.
 */
static int
parse_db_loc(
	char		*str,
	mddb_cfg_loc_t	*clp
)
{
	char		*p, *e;
	char		*minor_name = NULL;
	ddi_devid_t	ret_devid = NULL;

	clp->l_dev = 0;

	/* Driver name: everything up to the first ':', truncated to fit. */
	p = clp->l_driver;
	e = p + sizeof (clp->l_driver) - 1;
	while ((*str != ':') && (*str != '\0') && (p < e))
		*p++ = *str++;
	*p = '\0';
	if (*str++ != ':')
		return (-1);

	/* Minor number, in decimal. */
	clp->l_mnum = 0;
	while (ISNUM(*str)) {
		clp->l_mnum *= 10;
		clp->l_mnum += *str++ - '0';
	}
	if (*str++ != ':')
		return (-1);

	/* Block number, in decimal. */
	clp->l_blkno = 0;
	while (ISNUM(*str)) {
		clp->l_blkno *= 10;
		clp->l_blkno += *str++ - '0';
	}
	if (*str++ != ':')
		return (-1);

	/*
	 * If the md_devid_destroy flag is set, ignore the device ids.
	 * This is only to be used in a catastrophic failure case.  Examples
	 * would be where the device id of all drives in the system
	 * (especially the mirror'd root drives) had been changed
	 * by firmware upgrade or by a patch to an existing disk
	 * driver.  Another example would be in the case of non-unique
	 * device ids due to a bug.  The device id would be valid on
	 * the system, but would return the wrong dev_t.
	 */
	if (md_devid_destroy) {
		clp->l_devid_flags = 0;
		clp->l_devid = (uint64_t)NULL;
		clp->l_devid_sz = 0;
		clp->l_old_devid = (uint64_t)NULL;
		clp->l_old_devid_sz = 0;
		clp->l_minor_name[0] = '\0';
		return (0);
	}

	/* What is left of the string is the encoded devid (and minor name) */
	if (ddi_devid_str_decode(str,
	    (ddi_devid_t *)&ret_devid, &minor_name) == DDI_FAILURE)
		return (-1);

	clp->l_devid = (uint64_t)(uintptr_t)ret_devid;
	clp->l_devid_flags = 0;
	clp->l_old_devid = (uint64_t)NULL;
	clp->l_old_devid_sz = 0;

	/* If no device id associated with device, just return */
	if ((ddi_devid_t)(uintptr_t)clp->l_devid == (ddi_devid_t)NULL) {
		clp->l_devid_sz = 0;
		clp->l_minor_name[0] = '\0';
		/*
		 * minor_name was NULL before the decode call, so a non-NULL
		 * value here was handed back by it and is not used any
		 * further; release it rather than leaking it.
		 */
		if (minor_name != NULL)
			kmem_free(minor_name, strlen(minor_name) + 1);
		if (strcmp(str, "id0") == 0 && md_devid_destroy == 0 &&
		    md_keep_repl_state == 0) {
			/*
			 * No devid in md.conf; we're in recovery mode so
			 * lookup the devid for the device as specified by
			 * the devt in md.conf.
			 */
			lookup_db_devid(clp);
		}
		return (0);
	}

	clp->l_devid_flags = MDDB_DEVID_SPACE | MDDB_DEVID_VALID |
	    MDDB_DEVID_SZ;
	clp->l_devid_sz = (int)ddi_devid_sizeof(
	    (ddi_devid_t)(uintptr_t)clp->l_devid);

	/*
	 * Copy the minor name with the same bound lookup_db_devid() uses so
	 * that an over-long name in md.conf cannot overrun l_minor_name, and
	 * cope with a device id string that carried no minor name at all.
	 */
	if (minor_name != NULL) {
		(void) strlcpy(clp->l_minor_name, minor_name,
		    MDDB_MINOR_NAME_MAX);
		kmem_free(minor_name, strlen(minor_name) + 1);
	} else {
		clp->l_minor_name[0] = '\0';
	}

	return (0);
}

/*
 * grab driver name, minor, and block out of
 * strings like "driver:minor:block:devid driver:minor:block:devid ..."
 */
static void
parse_db_string(
	char		*str
)
{
	char		*p, *e;
	mddb_cfg_loc_t	*cl;
	char		restore_space;

	/* CSTYLED */
	cl = kmem_zalloc(sizeof (mddb_cfg_loc_t), KM_SLEEP);
	for (p = str; (*p != '\0'); ) {
		for (; ((*p != '\0') && (ISWHITE(*p))); ++p)
			;
		if (*p == '\0')
			break;
		for (e = p; ((*e != '\0') && (! ISWHITE(*e))); ++e)
			;
		/*
		 * Only give parse_db_loc 1 entry, so stuff a null into
		 * the string if we're not at the end.  We need to save this
		 * char and restore it after call.
		 */
		restore_space = '\0';
		if (*e != '\0') {
			restore_space = *e;
			*e = '\0';
		}
		if (parse_db_loc(p, cl) != 0) {
			cmn_err(CE_NOTE, "mddb: parsing error on '%s'", p);
		} else {
			(void) ridev(
			    &((mddb_set_t *)md_set[MD_LOCAL_SET].s_db)->s_rip,
			    cl, NULL, MDDB_F_PTCHED);
			if (cl->l_devid_flags & MDDB_DEVID_SPACE) {
				kmem_free((caddr_t)(uintptr_t)cl->l_devid,
				    cl->l_devid_sz);
			}
		}
		if (restore_space != '\0') {
			*e = restore_space;
		}
		p = e;
	}
	kmem_free(cl, sizeof (mddb_cfg_loc_t));
}

/*
 * Collect the database locations that are supplied as the properties
 * mddb_bootlist0 .. mddb_bootlist<md_maxbootlist - 1>.
 *
 * Each property name is formed in turn, the property is fetched, parsed
 * by parse_db_string() and its memory is released again.  A property
 * that does not exist, cannot be fetched or is empty is silently
 * skipped.  There is no guarantee that the properties exist in sequence,
 * for example mddb_bootlist1 and mddb_bootlist3 may exist while
 * mddb_bootlist2 does not, so every index is always tried.
 */
static void
parse_db_strings(void)
{
	/*
	 * size of namebuf should match uses of line and entry in
	 * libmeta meta_systemfile_append_mddb routine (meta_systemfile.c)
	 */
	char		namebuf[MDDB_BOOTLIST_MAX_LEN];
	caddr_t		prop;
	int		proplen;
	int		idx;

	for (idx = 0; idx < md_maxbootlist; idx++) {
		proplen = 0;
		(void) sprintf(namebuf, "mddb_bootlist%d", idx);

		if ((ddi_getlongprop(DDI_DEV_T_ANY, md_devinfo,
		    DDI_PROP_CANSLEEP, namebuf, (caddr_t)&prop,
		    &proplen) == DDI_PROP_SUCCESS) && (proplen > 0)) {
			if (md_init_debug)
				cmn_err(CE_NOTE, "%s is %s", namebuf, prop);

			parse_db_string(prop);
			kmem_free(prop, proplen);
		}
	}
}

/*
 * initit - initialize the in-core mddb state of set 'setno'.
 *
 * Takes mddb_lock and then the set mutex; both are released again on
 * every return path.
 *
 * The existing replicas are loaded with load_old_replicas().  If that
 * fails and MDDB_MUSTEXIST is set in 'flag', everything allocated so far
 * is torn down and an error is returned.  If it fails and MDDB_MUSTEXIST
 * is not set, a new empty database is built in core: locator block,
 * locator names, optionally the device id block, and a first directory
 * block.
 *
 * RETURN
 *	0		set initialized, or s_lbp was already set on entry
 *	MDDB_E_NOTNOW	md_set[setno].s_db is NULL
 *	MDDB_E_TAGDATA	returned by load_old_replicas() (MDDB_MUSTEXIST)
 *	MDDB_E_NODB	load failed (MDDB_MUSTEXIST); the load_old_replicas()
 *			return value is passed back instead when
 *			mddb_db_err_detail is set
 */
static int
initit(
	set_t		setno,
	int		flag
)
{
	int		i;
	mddb_set_t	*s;
	mddb_lb_t	*lbp;		/* pointer to locator block */
	mddb_ln_t	*lnp;		/* pointer to locator names */
	mddb_db_t	*dbp;		/* pointer to directory block */
	mddb_did_blk_t	*did_blkp;	/* pointer to Device ID block */
	mddb_did_ic_t	*did_icp;	/* pointer to Device ID incore area */
	mddb_bf_t	*bfp;
	side_t		sideno;
	side_t		maxsides;
	mddb_block_t	lb_blkcnt;
	int		retval = 0;
	md_dev64_t	dev;
	mddb_mnlb_t	*mnlbp;
	int		devid_flag;

	/* single threads all loads/unloads of sets */
	mutex_enter(&mddb_lock);
	mutex_enter(SETMUTEX(setno));

	if (((mddb_set_t *)md_set[setno].s_db) == NULL) {
		mutex_exit(SETMUTEX(setno));
		mutex_exit(&mddb_lock);
		return (MDDB_E_NOTNOW);
	}

	s = (mddb_set_t *)md_set[setno].s_db;

	single_thread_start(s);

	/*
	 * A locator block is already present, so the set has been (or is
	 * being) initialized by someone else.  Return success.
	 */
	if (s->s_lbp) {
		single_thread_end(s);
		mutex_exit(SETMUTEX(setno));
		mutex_exit(&mddb_lock);
		return (0);
	}

	uniqtime32(&s->s_inittime);

	/* grab database locations supplied as mddb_bootlist properties */
	if (setno == MD_LOCAL_SET)
		parse_db_strings();

	s->s_mbiarray = (mddb_mb_ic_t **)kmem_zalloc(
	    sizeof (mddb_mb_ic_t *) * mddb_maxcopies, KM_SLEEP);

	s->s_zombie = 0;
	s->s_staledeletes = 0;
	s->s_optcmtcnt = 0;
	s->s_opthavelck = 0;
	s->s_optwantlck = 0;
	s->s_optwaiterr = 0;
	s->s_opthungerr = 0;

	/*
	 * KEEPTAG can never be set for a MN diskset since no tags are
	 * allowed to be stored in a MN diskset.  No way to check
	 * if this is a MN diskset or not at this point since the mddb
	 * hasn't been read in from disk yet.  (flag will only have
	 * MULTINODE bit set if a new set is being created.)
	 */
	if (! (md_get_setstatus(s->s_setno) & MD_SET_KEEPTAG))
		dt_setup(s, NULL);

	md_clr_setstatus(s->s_setno, MD_SET_TOOFEW);

	/* populate the set's pool of buffer headers */
	for (i = 0; i <	mddb_maxbufheaders; i++) {
		bfp = (mddb_bf_t *)kmem_zalloc(sizeof (*bfp), KM_SLEEP);
		sema_init(&bfp->bf_buf.b_io, 0, NULL,
		    SEMA_DEFAULT, NULL);
		sema_init(&bfp->bf_buf.b_sem, 0, NULL,
		    SEMA_DEFAULT, NULL);
		bfp->bf_buf.b_offset = -1;
		freebuffer(s, bfp);
	}

	retval = load_old_replicas(s, flag);
	/* If 0 return value - success */
	if (! retval) {
		single_thread_end(s);
		mutex_exit(SETMUTEX(setno));
		mutex_exit(&mddb_lock);
		return (0);
	}

	/*
	 * If here, then the load_old_replicas() failed
	 */


	/* If the database was supposed to exist. */
	if (flag & MDDB_MUSTEXIST) {
		/*
		 * Tear down everything set up above or left behind by
		 * load_old_replicas().
		 *
		 * NOTE(review): the master block loop and the locator name
		 * free below dereference s->s_lbp without a NULL check,
		 * although s->s_lbp itself is NULL-checked further down.
		 * Confirm that load_old_replicas() never leaves
		 * s_mbiarray[i] or s_lnp set while s_lbp is NULL.
		 */
		if (s->s_mbiarray != (mddb_mb_ic_t **)NULL) {
			for (i = 0; i < mddb_maxcopies;	 i++) {
				if (! s->s_mbiarray[i])
					continue;
				dev = md_expldev(
				    s->s_lbp->lb_locators[i].l_dev);
				dev = md_xlate_targ_2_mini(dev);
				if (dev != NODEV64)
					mddb_devclose(dev);

				free_mbipp(&s->s_mbiarray[i]);
			}

			kmem_free((caddr_t)s->s_mbiarray,
			    sizeof (mddb_mb_ic_t *) * mddb_maxcopies);
			s->s_mbiarray = NULL;
		}

		/* locator names must go before the locator block (size) */
		if (s->s_lnp != (mddb_ln_t *)NULL) {
			kmem_free((caddr_t)s->s_lnp,
			    dbtob(s->s_lbp->lb_lnblkcnt));
			s->s_lnp = (mddb_ln_t *)NULL;
		}

		mddb_devid_icp_free(&s->s_did_icp, s->s_lbp);

		if (s->s_lbp != (mddb_lb_t *)NULL) {
			kmem_free((caddr_t)s->s_lbp,
			    dbtob(s->s_lbp->lb_blkcnt));
			s->s_lbp = (mddb_lb_t *)NULL;
		}

		/* drain and free the buffer headers */
		while ((bfp = allocbuffer(s, MDDB_NOSLEEP)) != NULL)
			kmem_free((caddr_t)bfp, sizeof (*bfp));

		single_thread_end(s);
		mutex_exit(SETMUTEX(setno));
		mutex_exit(&mddb_lock);

		if (retval == MDDB_E_TAGDATA)
			return (retval);

		/* Want a bit more detailed error messages */
		if (mddb_db_err_detail)
			return (retval);

		return (MDDB_E_NODB);
	}


	/*
	 * MDDB_MUSTEXIST not set - Creating a new database, so do
	 * more initialization.
	 */

	/* locator block size depends on the kind of set */
	lb_blkcnt = (mddb_block_t)((setno == MD_LOCAL_SET) ?
	    MDDB_LOCAL_LBCNT : MDDB_LBCNT);
	if (flag & MDDB_MULTINODE) {
		lb_blkcnt = MDDB_MNLBCNT;
	}

	if (s->s_lbp == NULL)
		s->s_lbp = (mddb_lb_t *)kmem_alloc(dbtob(lb_blkcnt), KM_SLEEP);
	lbp = s->s_lbp;

	bzero((caddr_t)lbp, dbtob(lb_blkcnt));
	lbp->lb_setno = setno;
	lbp->lb_magic = MDDB_MAGIC_LB;
	if (flag & MDDB_MULTINODE) {
		lbp->lb_revision = MDDB_REV_MNLB;
	} else {
		lbp->lb_revision = MDDB_REV_LB;
	}
	lbp->lb_inittime = s->s_inittime;
	/* mark every side locator as having no minor number yet */
	if (flag & MDDB_MULTINODE) {
		mnlbp = (mddb_mnlb_t *)lbp;
		for (i = 0; i < MDDB_NLB; i++) {
			for (sideno = 0; sideno < MD_MNMAXSIDES; sideno++) {
				mddb_mnsidelocator_t	*mnslp;
				mnslp = &mnlbp->lb_mnsidelocators[sideno][i];
				mnslp->mnl_mnum = NODEV32;
				mnslp->mnl_sideno = 0;
				mnslp->mnl_drvnm_index = 0;
			}
		}
	} else {
		maxsides = ((setno == MD_LOCAL_SET) ? 1 : MD_MAXSIDES);
		for (i = 0; i < MDDB_NLB; i++) {
			for (sideno = 0; sideno < maxsides; sideno++) {
				mddb_sidelocator_t	*slp;
				slp = &lbp->lb_sidelocators[sideno][i];
				slp->l_mnum = NODEV32;
			}
		}
	}
	lbp->lb_blkcnt = lb_blkcnt;

	/* lb starts on block 0 */
	/* locator names starts after locator block */
	lbp->lb_lnfirstblk = lb_blkcnt;
	if (flag & MDDB_MULTINODE) {
		lbp->lb_lnblkcnt = (mddb_block_t)MDDB_MNLNCNT;
	} else {
		lbp->lb_lnblkcnt = (mddb_block_t)((setno == MD_LOCAL_SET) ?
		    MDDB_LOCAL_LNCNT : MDDB_LNCNT);
	}

	if (flag & MDDB_MULTINODE) {
		/* Creating a multinode diskset */
		md_set_setstatus(setno, MD_SET_MNSET);
		lbp->lb_flags |= MDDB_MNSET;
	}

	/* Data portion of mddb located after locator names */
	lbp->lb_dbfirstblk = lbp->lb_lnfirstblk + lbp->lb_lnblkcnt;

	/* the btodb that follows is converting the directory block size */
	/* Data tag part of mddb located after first block of mddb data */
	lbp->lb_dtfirstblk = (mddb_block_t)(lbp->lb_dbfirstblk +
	    btodb(MDDB_BSIZE));
	/* Data tags are not used in MN diskset - so set count to 0 */
	if (flag & MDDB_MULTINODE)
		lbp->lb_dtblkcnt = (mddb_block_t)0;
	else
		lbp->lb_dtblkcnt = (mddb_block_t)MDDB_DT_BLOCKS;


	/* empty locator names area */
	lnp = (mddb_ln_t *)kmem_zalloc(dbtob(lbp->lb_lnblkcnt), KM_SLEEP);
	lnp->ln_magic = MDDB_MAGIC_LN;
	if (flag & MDDB_MULTINODE) {
		lnp->ln_revision = MDDB_REV_MNLN;
	} else {
		lnp->ln_revision = MDDB_REV_LN;
	}
	s->s_lnp = lnp;

	/*
	 * Set up Device ID portion of Locator Block.
	 * Do not set locator to device id style if
	 * md_devid_destroy is 1 and md_keep_repl_state is 1
	 * (destroy all device id data and keep replica in
	 * non device id mode).
	 *
	 * This is logically equivalent to set locator to
	 * device id style if md_devid_destroy is 0 or
	 * md_keep_repl_state is 0.
	 *
	 * In SunCluster environment, device id mode is disabled
	 * which means diskset will be run in non-devid mode.  For
	 * localset, the behavior will remain intact and run in
	 * device id mode.
	 *
	 * In multinode diskset devids are turned off.
	 */
	devid_flag = 1;
	if (cluster_bootflags & CLUSTER_CONFIGURED)
		if (setno != MD_LOCAL_SET)
			devid_flag = 0;
	if (flag & MDDB_MULTINODE)
		devid_flag = 0;
	if ((md_devid_destroy == 1) && (md_keep_repl_state == 1))
		devid_flag = 0;
	/*
	 * if we weren't devid style before and md_keep_repl_state=1
	 * we need to stay non-devid
	 *
	 * NOTE(review): lbp was bzero'ed above and only MDDB_MNSET has
	 * been or'ed into lb_flags since, so MDDB_DEVID_STYLE is never
	 * set at this point and md_keep_repl_state == 1 alone clears
	 * devid_flag.  Confirm this is the intended behavior.
	 */
	if (((lbp->lb_flags & MDDB_DEVID_STYLE) == 0) &&
	    (md_keep_repl_state == 1))
		devid_flag = 0;
	if (devid_flag) {
		/* device id area is placed right after the data tags */
		lbp->lb_didfirstblk = lbp->lb_dtfirstblk +
		    lbp->lb_dtblkcnt;
		lbp->lb_didblkcnt = (mddb_block_t)MDDB_DID_BLOCKS;
		lbp->lb_flags |= MDDB_DEVID_STYLE;

		did_icp = (mddb_did_ic_t *)kmem_zalloc
		    (sizeof (mddb_did_ic_t), KM_SLEEP);
		did_blkp = (mddb_did_blk_t *)
		    kmem_zalloc(dbtob(lbp->lb_didblkcnt), KM_SLEEP);
		did_blkp->blk_magic = MDDB_MAGIC_DI;
		did_blkp->blk_revision = MDDB_REV_DI;
		did_icp->did_ic_blkp = did_blkp;
		s->s_did_icp = did_icp;
	}

	setidentifier(s, &lbp->lb_ident);
	uniqtime32(&lbp->lb_timestamp);
	/* first (empty) directory block, placed at the start of the data */
	dbp = (mddb_db_t *)kmem_zalloc(sizeof (mddb_db_t), KM_SLEEP);
	dbp->db_magic = MDDB_MAGIC_DB;
	dbp->db_revision = MDDB_REV_DB;
	uniqtime32(&dbp->db_timestamp);
	dbp->db_nextblk = 0;
	dbp->db_firstentry = NULL;
	dbp->db_blknum = lbp->lb_dbfirstblk;
	dbp->db_recsum = MDDB_GLOBAL_XOR;
	s->s_dbp = dbp;
	single_thread_end(s);
	mutex_exit(SETMUTEX(setno));
	mutex_exit(&mddb_lock);
	return (0);
}

/*
 * mddb_setenter - acquire the set mutex of 'setno' and return the set's
 * mddb_set_t, initializing the set via initit() first when it has no
 * locator block yet and MDDB_NOINIT is not set in 'flag'.
 *
 * The s_un and s_ui arrays of the set are allocated here (KM_NOSLEEP)
 * when they are not present yet.
 *
 * RETURN
 *	non-NULL	pointer to the set; SETMUTEX(setno) is held and must
 *			be released by the caller (see mddb_setexit()).
 *	NULL		failure; SETMUTEX(setno) is NOT held.  When
 *			errorcodep is non-NULL it is set to MDDB_E_NOTOWNER
 *			(no database for the set or allocation failure) or
 *			to the error returned by initit().
 */
mddb_set_t *
mddb_setenter(
	set_t		setno,
	int		flag,
	int		*errorcodep
)
{
	mddb_set_t	*s;
	int		err = 0;
	size_t		sz = sizeof (void *) * MD_MAXUNITS;

	mutex_enter(SETMUTEX(setno));
	if (! md_set[setno].s_db) {
		mutex_exit(SETMUTEX(setno));
		if (errorcodep != NULL)
			*errorcodep = MDDB_E_NOTOWNER;
		return (NULL);
	}

	/* Allocate s_un and s_ui arrays if not already present. */
	if (md_set[setno].s_un == NULL) {
		md_set[setno].s_un = kmem_zalloc(sz, KM_NOSLEEP);
		if (md_set[setno].s_un == NULL) {
			mutex_exit(SETMUTEX(setno));
			if (errorcodep != NULL)
				*errorcodep = MDDB_E_NOTOWNER;
			return (NULL);
		}
	}
	if (md_set[setno].s_ui == NULL) {
		md_set[setno].s_ui = kmem_zalloc(sz, KM_NOSLEEP);
		if (md_set[setno].s_ui == NULL) {
			/*
			 * Back out the s_un array while the set mutex is
			 * still held, so that no other thread entering the
			 * set can pick up a pointer that is about to be
			 * freed.  Release the same mutex that was taken
			 * above.
			 */
			kmem_free(md_set[setno].s_un, sz);
			md_set[setno].s_un = NULL;
			mutex_exit(SETMUTEX(setno));
			if (errorcodep != NULL)
				*errorcodep = MDDB_E_NOTOWNER;
			return (NULL);
		}
	}
	s = (mddb_set_t *)md_set[setno].s_db;
	/* already initialized: return with the set mutex held */
	if (s->s_lbp)
		return (s);

	/* caller does not want the set initialized: same, mutex held */
	if (flag & MDDB_NOINIT)
		return (s);

	/*
	 * Release the set mutex - it will be acquired and released in
	 * initit after acquiring the mddb_lock.  This is done to assure
	 * that mutexes are always acquired in the same order to prevent
	 * possible deadlock
	 */
	mutex_exit(SETMUTEX(setno));

	if ((err = initit(setno, flag)) != 0) {
		if (errorcodep != NULL)
			*errorcodep = err;
		return (NULL);
	}

	mutex_enter(SETMUTEX(setno));
	return ((mddb_set_t *)md_set[setno].s_db);
}

/*
 * Release the set lock for a given set.
 *
 * In a MN diskset, this routine may send messages to the rpc.mdcommd
 * in order to have the slave nodes re-parse parts of the mddb.
 * Messages are only sent if the global ioctl lock is not held.
 *
 * With the introduction of multi-threaded ioctls, there is no way
 * to determine which thread(s) are holding the ioctl lock.  So, if
 * the ioctl lock is held (by process X) process X will send the
 * messages to the slave nodes when process X releases the ioctl lock.
 *
 * The caller must hold SETMUTEX(s->s_setno); it is released on return.
 */
void
mddb_setexit(
	mddb_set_t	*s
)
{
	md_mn_msg_mddb_parse_t		*mddb_parse_msg;
	md_mn_kresult_t			*kresult;
	mddb_lb_t			*lbp = s->s_lbp;
	int				i;
	int				rval;

	/*
	 * If not a MN diskset OR
	 * a MN diskset but this node isn't master,
	 * then release the mutex.
	 */
	if (!(MD_MNSET_SETNO(s->s_setno)) ||
	    ((MD_MNSET_SETNO(s->s_setno)) &&
	    (!md_set[s->s_setno].s_am_i_master))) {
		mutex_exit(SETMUTEX(s->s_setno));
		return;
	}

	/*
	 * If global ioctl lock is held, then send no messages,
	 * just release mutex and return.
	 *
	 */
	if (md_status & MD_GBL_IOCTL_LOCK) {
		mutex_exit(SETMUTEX(s->s_setno));
		return;
	}

	/*
	 * This thread is not holding the ioctl lock, so drop the set
	 * lock, send messages to slave nodes to reparse portions
	 * of the mddb and return.
	 *
	 * If the block parse flag is set, do not send parse messages.
	 * This flag is set when master is adding a new mddb that would
	 * cause parse messages to be sent to the slaves, but the slaves
	 * don't have knowledge of the new mddb yet since the mddb add
	 * operation hasn't been run on the slave nodes yet.  When the
	 * master unblocks the parse flag, the parse messages will be
	 * generated.
	 *
	 * If s_mn_parseflags_sending is non-zero, then another thread
	 * is already currently sending a parse message, so just release
	 * the mutex and return.  If an mddb change occurred that results
	 * in a parse message to be generated, the thread that is currently
	 * sending a parse message would generate the additional parse message.
	 *
	 * If s_mn_parseflags_sending is zero and parsing is not blocked,
	 * then loop until s_mn_parseflags is 0 (until there are no more
	 * messages to send).
	 * While s_mn_parseflags is non-zero,
	 * 	put snapshot of parse_flags in s_mn_parseflags_sending
	 * 	set s_mn_parseflags to zero
	 *	release mutex
	 *	send message
	 *	re-grab mutex
	 *	set s_mn_parseflags_sending to zero
	 */
	mddb_parse_msg = kmem_zalloc(sizeof (md_mn_msg_mddb_parse_t), KM_SLEEP);
	while (((s->s_mn_parseflags_sending & MDDB_PARSE_MASK) == 0) &&
	    (s->s_mn_parseflags & MDDB_PARSE_MASK) &&
	    (!(md_get_setstatus(s->s_setno) & MD_SET_MNPARSE_BLK))) {
		/* Grab snapshot of parse flags */
		s->s_mn_parseflags_sending = s->s_mn_parseflags;
		s->s_mn_parseflags = 0;

		mutex_exit(SETMUTEX(s->s_setno));

		/*
		 * Send the message to the slaves to re-parse
		 * the indicated portions of the mddb. Send the status
		 * of the 50 mddbs in this set so that slaves know which
		 * mddbs that the master node thinks are 'good'.
		 * Otherwise, slave may reparse, but from wrong replica.
		 */
		mddb_parse_msg->msg_parse_flags = s->s_mn_parseflags_sending;
		for (i = 0; i < MDDB_NLB; i++) {
			mddb_parse_msg->msg_lb_flags[i] =
			    lbp->lb_locators[i].l_flags;
		}
		kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
		/*
		 * Keep trying until the message has been sent.  The send
		 * status is local to each pass of the outer loop: a
		 * successful send for an earlier snapshot must not cause
		 * the message for this snapshot to be skipped, since the
		 * parse flags of this snapshot have already been cleared.
		 */
		do {
			rval = mdmn_ksend_message(s->s_setno,
			    MD_MN_MSG_MDDB_PARSE, 0, 0,
			    (char *)mddb_parse_msg,
			    sizeof (md_mn_msg_mddb_parse_t), kresult);
			if (rval != 0)
				cmn_err(CE_WARN, "mddb_setexit: Unable to send "
				    "mddb update message to other nodes in "
				    "diskset %s\n", s->s_setname);
		} while (rval != 0);
		kmem_free(kresult, sizeof (md_mn_kresult_t));

		/*
		 * Re-grab mutex to clear sending field and to
		 * see if another parse message needs to be generated.
		 */
		mutex_enter(SETMUTEX(s->s_setno));
		s->s_mn_parseflags_sending = 0;
	}
	kmem_free(mddb_parse_msg, sizeof (md_mn_msg_mddb_parse_t));
	mutex_exit(SETMUTEX(s->s_setno));
}

/*
 * Release the set mutex of the given set without sending any parse
 * messages.
 */
static void
mddb_setexit_no_parse(
	mddb_set_t	*s
)
{
	set_t		setno = s->s_setno;

	mutex_exit(SETMUTEX(setno));
}

/*
 * mddb_lb_did_convert - size, and optionally perform, the conversion of
 * the locator block of set 's' to device id style.
 *
 * *blk_cnt is always set to the number of disk blocks needed:
 * MDDB_DID_BLOCKS for the device id block itself plus, conservatively,
 * at least one block for every replica a device id can be obtained for.
 *
 * When 'doit' is non-zero the set is entered, the device id block is
 * allocated, a device id is added for every such replica and the locator
 * block is pushed out.
 *
 * RETURN
 *	0	success (always when doit is zero)
 *	1	the set could not be entered, no free blocks were
 *		available, or push_lb() failed
 */
uint_t
mddb_lb_did_convert(mddb_set_t *s, uint_t doit, uint_t *blk_cnt)
{
	mddb_lb_t		*lbp = s->s_lbp;
	mddb_locator_t		*lp;
	mddb_did_ic_t		*did_icp;
	mddb_did_blk_t		*did_blkp = NULL;
	ddi_devid_t		ret_devid;
	dev_t			ddi_dev;
	md_dev64_t		dev64; /* tmp var to make code look better */
	char			*minor_name;
	uint_t			devid_len;
	uint_t			li;
	int			retval;
	int			err;

	/* Need disk block(s) to hold mddb_did_blk_t */
	*blk_cnt = MDDB_DID_BLOCKS;

	if (doit) {
		/*
		 * Alloc mddb_did_blk_t disk block and fill in header area.
		 * Don't fill in did magic number until end of routine so
		 * if machine panics in the middle of conversion, the
		 * device id information will be thrown away at the
		 * next snarfing of this set.
		 * Need to set DEVID_STYLE so that mddb_devid_add will
		 * function properly.
		 */
		/* grab the mutex */
		if (mddb_setenter(s->s_setno, MDDB_NOINIT, &err) == NULL)
			return (1);

		single_thread_start(s);
		lbp->lb_didfirstblk = getfreeblks(s, MDDB_DID_BLOCKS);
		if (lbp->lb_didfirstblk == 0) {
			single_thread_end(s);
			mddb_setexit(s);
			return (1);
		}
		lbp->lb_didblkcnt = (mddb_block_t)MDDB_DID_BLOCKS;

		did_icp = (mddb_did_ic_t *)kmem_zalloc(sizeof (mddb_did_ic_t),
		    KM_SLEEP);
		did_blkp = (mddb_did_blk_t *)kmem_zalloc(MDDB_DID_BYTES,
		    KM_SLEEP);
		did_blkp->blk_revision = MDDB_REV_DI;
		did_icp->did_ic_blkp = did_blkp;

		s->s_did_icp = did_icp;
		lbp->lb_flags |= MDDB_DEVID_STYLE;
	}

	/* Fill in information in mddb_did_info_t array */
	for (li = 0; li < lbp->lb_loccnt; li++) {
		lp = &lbp->lb_locators[li];
		if (lp->l_flags & MDDB_F_DELETED)
			continue;

		dev64 = md_xlate_targ_2_mini(md_expldev(lp->l_dev));
		ddi_dev = md_dev64_to_dev(dev64);

		/*
		 * No translation available for replica.
		 * Could fail conversion to device id replica,
		 * but instead will just continue with next
		 * replica in list.
		 */
		if (ddi_dev == NODEV)
			continue;

		/* replicas without a device id are skipped as well */
		if (ddi_lyr_get_devid(ddi_dev, &ret_devid) != DDI_SUCCESS)
			continue;

		/*
		 * Just count each devid as at least 1 block.  This
		 * is conservative since several device id's may fit
		 * into 1 disk block, but it's better to overestimate
		 * the number of blocks needed than to underestimate.
		 */
		devid_len = (int)ddi_devid_sizeof(ret_devid);
		*blk_cnt += btodb(devid_len + (MDDB_BSIZE - 1));

		if (doit && (ddi_lyr_get_minor_name(ddi_dev, S_IFBLK,
		    &minor_name) == DDI_SUCCESS)) {
			if (mddb_devid_add(s, li, ret_devid, minor_name)) {
				cmn_err(CE_WARN,
				    "Not enough space in metadb"
				    " to add device id for"
				    "  dev: major = %d, "
				    "minor = %d\n",
				    getmajor(ddi_dev),
				    getminor(ddi_dev));
			}
			kmem_free(minor_name, strlen(minor_name) + 1);
		}
		ddi_devid_free(ret_devid);
	}

	if (!doit)
		return (0);

	/* conversion complete: make the device id block valid and push */
	did_blkp->blk_magic = MDDB_MAGIC_DI;
	retval = push_lb(s);
	(void) upd_med(s, "mddb_lb_did_convert(0)");
	single_thread_end(s);
	mddb_setexit(s);

	return ((retval != 0) ? 1 : 0);
}

/*
 * init_set - make sure an mddb_set_t exists for the set described by
 * 'cp' (the local set when cp is NULL) and enter it.
 *
 * If the set has no mddb_set_t yet, one is allocated, its condition
 * variables are initialized, its identity (host id for the local set,
 * creation time and set name otherwise) and mediator data are filled in,
 * and it is hooked into md_set[].
 *
 * Returns the result of mddb_setenter(setno, flag, errp), or NULL
 * (leaving *errp untouched) when the set number is not below MD_MAXSETS.
 */
static mddb_set_t *
init_set(
	mddb_config_t	*cp,
	int		flag,
	int		*errp
)
{
	mddb_set_t	*s;
	char		*setname;
	set_t		setno;
	side_t		sideno;
	struct timeval32 *created;

	if (cp == NULL) {
		/* no config structure: defaults describe the local set */
		setname = NULL;
		setno = MD_LOCAL_SET;
		sideno = 0;
		created = NULL;
	} else {
		setname = cp->c_setname;
		setno = cp->c_setno;
		sideno = cp->c_sideno;
		created = &cp->c_timestamp;
	}

	if (setno >= MD_MAXSETS)
		return ((mddb_set_t *)NULL);

	/* set already known, just enter it */
	if (md_set[setno].s_db)
		return (mddb_setenter(setno, flag, errp));

	s = (mddb_set_t *)kmem_zalloc(sizeof (*s), KM_SLEEP);

	cv_init(&s->s_buf_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&s->s_single_thread_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&s->s_optqueuing_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&s->s_opthungerr_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&s->s_optwantlck_cv, NULL, CV_DEFAULT, NULL);

	s->s_setno = setno;
	s->s_sideno = sideno;
	if (setno != MD_LOCAL_SET) {
		/* diskset: identified by creation time, carries a name */
		s->s_ident.createtime = *created;
		s->s_setname = (char *)kmem_alloc(strlen(setname) + 1,
		    KM_SLEEP);
		(void) strcpy(s->s_setname, setname);
	} else {
		/* local set: identified by the host id */
		(void) snprintf(s->s_ident.serial, sizeof (s->s_ident.serial),
		    "%u", zone_get_hostid(NULL));
	}

	/* have a config struct,  copy mediator information */
	if (cp != NULL)
		s->s_med = cp->c_med;		/* structure assignment */

	md_set[setno].s_db = (void *) s;

	SE_NOTIFY(EC_SVM_STATE, ESC_SVM_TAKEOVER, SVM_TAG_SET, setno, NODEV64);

	return (mddb_setenter(setno, flag, errp));
}

/*
 * mddb_unload_set - release all in-core mddb state of set 'setno'.
 *
 * Enters the set without initializing it; nothing is done when the set
 * cannot be entered.  Frees the directory blocks with their entries and
 * records, the replica information list, the master blocks (closing the
 * replica devices), locator names, locator block and device id area,
 * free bitmap, buffer headers, data buffer, set name, data tag list and
 * finally the mddb_set_t itself.  The related md_set[] state and set
 * status bits are cleared before the set mutex is released.
 */
void
mddb_unload_set(
	set_t		setno
)
{

	mddb_set_t	*s;
	mddb_db_t	*dbp, *next_dbp;
	mddb_de_ic_t	*dep, *next_dep;
	mddb_bf_t	*bfp;
	int		i;
	md_dev64_t	dev;

	s = mddb_setenter(setno, MDDB_NOINIT, NULL);
	if (s == NULL)
		return;

	single_thread_start(s);

	s->s_opthavequeuinglck = 0;
	s->s_optwantqueuinglck = 0;

	/* directory blocks, their entries and the records behind them */
	dbp = s->s_dbp;
	while (dbp != 0) {
		dep = dbp->db_firstentry;
		while (dep != NULL) {
			next_dep = dep->de_next;
			if (dep->de_rb_userdata != NULL) {
				if (dep->de_icreqsize)
					kmem_free(dep->de_rb_userdata_ic,
					    dep->de_icreqsize);
				else
					kmem_free(dep->de_rb_userdata,
					    dep->de_reqsize);
			}
			kmem_free((caddr_t)dep->de_rb, dep->de_recsize);
			kmem_free((caddr_t)dep, sizeofde(dep));
			dep = next_dep;
		}
		next_dbp = dbp->db_next;
		kmem_free((caddr_t)dbp, sizeof (mddb_db_t));
		dbp = next_dbp;
	}
	s->s_dbp = (mddb_db_t *)NULL;

	free_rip(&s->s_rip);

	/* master blocks: close the replica device and free the block */
	for (i = 0; (i < mddb_maxcopies) && s->s_mbiarray; i++) {
		if (s->s_mbiarray[i]) {
			dev = md_expldev(s->s_lbp->lb_locators[i].l_dev);
			dev = md_xlate_targ_2_mini(dev);
			if (dev != NODEV64)
				mddb_devclose(dev);

			free_mbipp(&s->s_mbiarray[i]);
		}
	}

	if (s->s_mbiarray) {
		kmem_free((caddr_t)s->s_mbiarray,
		    sizeof (mddb_mb_ic_t *) * mddb_maxcopies);
		s->s_mbiarray = (mddb_mb_ic_t **)NULL;
	}

	/* locator names are sized from the locator block: free them first */
	if (s->s_lnp) {
		kmem_free((caddr_t)s->s_lnp, dbtob(s->s_lbp->lb_lnblkcnt));
		s->s_lnp = (mddb_ln_t *)NULL;
	}

	if (s->s_lbp) {
		mddb_devid_icp_free(&s->s_did_icp, s->s_lbp);
		kmem_free((caddr_t)s->s_lbp, dbtob(s->s_lbp->lb_blkcnt));
		s->s_lbp = (mddb_lb_t *)NULL;
	}

	if (s->s_freebitmap) {
		kmem_free((caddr_t)s->s_freebitmap, s->s_freebitmapsize);
		s->s_freebitmap = NULL;
		s->s_freebitmapsize = 0;
	}

	/* drain and free the buffer headers */
	for (bfp = allocbuffer(s, MDDB_NOSLEEP); bfp != NULL;
	    bfp = allocbuffer(s, MDDB_NOSLEEP))
		kmem_free((caddr_t)bfp, sizeof (*bfp));

	if (s->s_databuffer_size) {
		kmem_free(s->s_databuffer, s->s_databuffer_size);
		s->s_databuffer_size = 0;
	}

	if (s->s_setname != NULL)
		kmem_free((caddr_t)s->s_setname, strlen(s->s_setname)+1);

	/* Data tags not supported on MN sets. */
	if (!(md_get_setstatus(setno) & MD_SET_MNSET))
		dtl_freel(&s->s_dtlp);

	md_set[setno].s_db = NULL;
	ASSERT(s->s_singlelockwanted == 0);
	kmem_free(s, sizeof (mddb_set_t));

	/* Take care of things setup in the md_set array */
	if ((! (md_get_setstatus(setno) & MD_SET_KEEPTAG)) &&
	    md_set[setno].s_dtp) {
		kmem_free((caddr_t)md_set[setno].s_dtp, MDDB_DT_BYTES);
		md_set[setno].s_dtp = NULL;
	}

	md_clr_setstatus(setno, MD_SET_ACCOK | MD_SET_ACCEPT |
	    MD_SET_TAGDATA | MD_SET_USETAG | MD_SET_TOOFEW | MD_SET_STALE |
	    MD_SET_OWNERSHIP | MD_SET_BADTAG | MD_SET_CLRTAG | MD_SET_MNSET |
	    MD_SET_DIDCLUP | MD_SET_MNPARSE_BLK | MD_SET_MN_MIR_STATE_RC |
	    MD_SET_IMPORT | MD_SET_REPLICATED_IMPORT);

	mutex_exit(SETMUTEX(setno));
}

/*
 * Store a splitname (prefix, suffix) for locator 'li' in the locator
 * name structure.
 *
 * returns 0 if name can be put into locator block
 * returns 1 if locator block prefixes are all used
 *
 * The prefix is shared: an existing identical prefix is reused,
 * otherwise the first unused prefix slot is filled in.
 *
 * For traditional diskset, the sideno is the index into the suffixes
 * array in the locator name structure.
 * For the MN diskset, the sideno is the nodeid which can be any number,
 * so the index passed in is the index into the mnsuffixes array
 * in the locator structure.  This index was computed by the
 * routine checklocator which basically checked the locator block
 * mnside locator structure.
 */
static int
splitname2locatorblock(
	md_splitname	*spn,
	mddb_ln_t	*lnp,
	int		li,
	side_t		sideno,
	int		index
)
{
	uchar_t			pfx;
	md_name_suffix		*sn;
	md_mnname_suffix_t	*mnsn;
	mddb_mnln_t		*mnlnp;

	/* look for a prefix that is identical to the new one */
	for (pfx = 0; pfx < MDDB_PREFIXCNT; pfx++) {
		if ((lnp->ln_prefixes[pfx].pre_len ==
		    SPN_PREFIX(spn).pre_len) &&
		    (bcmp(lnp->ln_prefixes[pfx].pre_data,
		    SPN_PREFIX(spn).pre_data,
		    SPN_PREFIX(spn).pre_len) == 0))
			break;
	}

	if (pfx == MDDB_PREFIXCNT) {
		/* none found, so claim the first unused prefix slot */
		pfx = 0;
		while ((pfx < MDDB_PREFIXCNT) &&
		    (lnp->ln_prefixes[pfx].pre_len != 0))
			pfx++;
		if (pfx == MDDB_PREFIXCNT)
			return (1);
		bcopy(SPN_PREFIX(spn).pre_data, lnp->ln_prefixes[pfx].pre_data,
		    SPN_PREFIX(spn).pre_len);
		lnp->ln_prefixes[pfx].pre_len = SPN_PREFIX(spn).pre_len;
	}

	if (lnp->ln_revision != MDDB_REV_MNLN) {
		/* traditional or local set: suffix is indexed by sideno */
		sn = &lnp->ln_suffixes[sideno][li];
		sn->suf_len = SPN_SUFFIX(spn).suf_len;
		sn->suf_prefix = pfx;
		bcopy(SPN_SUFFIX(spn).suf_data, sn->suf_data,
		    SPN_SUFFIX(spn).suf_len);
		return (0);
	}

	/* If a MN diskset, use index */
	mnlnp = (mddb_mnln_t *)lnp;
	mnsn = &mnlnp->ln_mnsuffixes[index][li];
	mnsn->mn_ln_sideno = sideno;
	mnsn->mn_ln_suffix.suf_len = SPN_SUFFIX(spn).suf_len;
	mnsn->mn_ln_suffix.suf_prefix = pfx;
	bcopy(SPN_SUFFIX(spn).suf_data,
	    mnsn->mn_ln_suffix.suf_data, SPN_SUFFIX(spn).suf_len);
	return (0);
}

/*
 * Look up the locator name of locator 'li' for the given sideno and
 * return it as a splitname (suffix plus the prefix the suffix refers to).
 *
 * For a MN diskset the suffix is found by searching the mnsuffixes of
 * the locator for an entry with a matching side number; when no entry
 * matches, spn is left untouched.
 */
void
mddb_locatorblock2splitname(
	mddb_ln_t	*lnp,
	int		li,
	side_t		sideno,
	md_splitname	*spn
)
{
	int			iprefix;
	md_name_suffix		*sn;
	md_mnname_suffix_t	*mnsn;
	int			slot;
	mddb_mnln_t		*mnlnp;

	if (lnp->ln_revision != MDDB_REV_MNLN) {
		/* traditional or local set: sideno indexes the suffixes */
		sn = &lnp->ln_suffixes[sideno][li];
		SPN_SUFFIX(spn).suf_len = sn->suf_len;
		bcopy(sn->suf_data, SPN_SUFFIX(spn).suf_data,
		    SPN_SUFFIX(spn).suf_len);
		iprefix = sn->suf_prefix;
	} else {
		mnlnp = (mddb_mnln_t *)lnp;
		slot = 0;
		while (slot < MD_MNMAXSIDES) {
			mnsn = &mnlnp->ln_mnsuffixes[slot][li];
			if (mnsn->mn_ln_sideno == sideno)
				break;
			slot++;
		}
		/* no name recorded for this side */
		if (slot == MD_MNMAXSIDES)
			return;

		SPN_SUFFIX(spn).suf_len = mnsn->mn_ln_suffix.suf_len;
		bcopy(mnsn->mn_ln_suffix.suf_data, SPN_SUFFIX(spn).suf_data,
		    SPN_SUFFIX(spn).suf_len);
		iprefix = mnsn->mn_ln_suffix.suf_prefix;
	}

	/* the prefix is shared and referenced by index from the suffix */
	SPN_PREFIX(spn).pre_len = lnp->ln_prefixes[iprefix].pre_len;
	bcopy(lnp->ln_prefixes[iprefix].pre_data, SPN_PREFIX(spn).pre_data,
	    SPN_PREFIX(spn).pre_len);
}

/*
 * getdeldev - return information about a replica of set cp->c_setno
 * and, for MDDB_DELDEV, delete that replica.
 *
 * The replica is selected by cp->c_id, which is either an absolute
 * locator block index (cp->c_subcmd == MDDB_CONFIG_ABS) or the index
 * among the replicas that are not marked deleted.
 *
 * On return cp carries the set flags (stale, too few), side number,
 * replica count, the locator and the device name of the replica and,
 * for MDDB_ENDDEV, the highest physical block used by the replica.
 *
 * RETURN
 *	0 on success, otherwise the value of the md error routine that
 *	recorded the failure in ep.
 */
static int
getdeldev(
	mddb_config_t	*cp,
	int		command,
	md_error_t	*ep
)
{
	mddb_set_t	*s;
	mddb_lb_t	*lbp;
	mddb_locator_t	*locators;
	uint_t		loccnt;
	mddb_mb_ic_t	*mbip;
	mddb_block_t	blk;
	int		err = 0;
	int		i, j;
	int		li;
	uint_t		commitcnt;
	set_t		setno = cp->c_setno;
	uint_t		set_status;
	md_dev64_t	dev;
	int		flags = MDDB_MUSTEXIST;
	mddb_ri_t	*rip;

	cp->c_dbmax = MDDB_NLB;

	/*
	 * Data checking
	 */
	if (setno >= md_nsets || cp->c_id < 0 ||
	    cp->c_id > cp->c_dbmax) {
		return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));
	}

	if (cp->c_flags & MDDB_C_STALE)
		flags |= MDDB_MN_STALE;

	if ((s = mddb_setenter(setno, flags, &err)) == NULL)
		return (mddbstatus2error(ep, err, NODEV32, setno));

	cp->c_flags = 0;

	lbp = s->s_lbp;
	loccnt = lbp->lb_loccnt;
	locators = lbp->lb_locators;

	/* shorthand */
	set_status = md_get_setstatus(setno);

	if (set_status & MD_SET_STALE)
		cp->c_flags |= MDDB_C_STALE;

	if (set_status & MD_SET_TOOFEW)
		cp->c_flags |= MDDB_C_TOOFEW;

	cp->c_sideno = s->s_sideno;

	cp->c_dbcnt = 0;
	/*
	 * go through and count active entries
	 */
	for (i = 0; i < loccnt; i++) {
		if (locators[i].l_flags & MDDB_F_DELETED)
			continue;
		cp->c_dbcnt++;
	}

	/*
	 * add the ability to accept a locator block index
	 * which is not relative to previously deleted replicas.  This
	 * is for support of MD_DEBUG=STAT in metastat since it asks for
	 * replica information specifically for each of the mirror resync
	 * records.  MDDB_CONFIG_SUBCMD uses one of the pad spares in
	 * the mddb_config_t type.
	 */
	if (cp->c_subcmd == MDDB_CONFIG_ABS) {
		/*
		 * c_id is used directly as an index into the MDDB_NLB
		 * locator slots, so c_dbmax (== MDDB_NLB) itself is
		 * already one past the last valid slot.
		 */
		if (cp->c_id < 0 || cp->c_id >= cp->c_dbmax) {
			mddb_setexit(s);
			return (mdmddberror(ep, MDE_DB_INVALID, NODEV32,
			    setno));
		}
		li = cp->c_id;
	} else {
		if (cp->c_id >= cp->c_dbcnt) {
			mddb_setexit(s);
			return (mdmddberror(ep, MDE_DB_INVALID, NODEV32,
			    setno));
		}

		/*
		 * Find the locator index of the c_id'th replica that is
		 * not deleted.  The loop is bounded by the check above:
		 * there are more than c_id such replicas.
		 */
		/* CSTYLED */
		for (li = 0, j = 0; /* void */; li++) {
			if (locators[li].l_flags & MDDB_F_DELETED)
				continue;
			j++;
			if (j > cp->c_id)
				break;
		}
	}

	if (command == MDDB_ENDDEV) {
		daddr_t ib = 0, jb;

		/* report the highest physical block in use by the replica */
		blk = 0;
		if (s->s_mbiarray[li]) {
			mbip = s->s_mbiarray[li];
			while ((jb = getphysblk(blk++, mbip)) > 0) {
				if (jb > ib)
					ib = jb;
			}
			cp->c_dbend = (int)ib;
		} else {
			cp->c_dbend = 0;
		}
	}

	locator2cfgloc(lbp, &cp->c_locator, li, s->s_sideno, s->s_did_icp);
	mddb_locatorblock2splitname(s->s_lnp, li, s->s_sideno, &cp->c_devname);

	/* anything but a delete is done once the information is returned */
	if (command != MDDB_DELDEV) {
		mddb_setexit(s);
		return (0);
	}

	/* Currently don't allow addition/deletion of sides during upgrade */
	if (MD_UPGRADE) {
		cmn_err(CE_WARN,
		    "Deletion of replica not allowed during upgrade.\n");
		mddb_setexit(s);
		return (mdmddberror(ep, MDE_DB_NOTNOW, NODEV32, setno));
	}

	/*
	 * If here, replica delete in progress.
	 */
	single_thread_start(s);

	if ((! (locators[li].l_flags & MDDB_F_EMASTER)) &&
	    (locators[li].l_flags & MDDB_F_ACTIVE)) {
		/*
		 * Write the locator block to this replica only, with a
		 * commit count of 0.  The in-core commit count is put
		 * back afterwards.
		 */
		commitcnt = lbp->lb_commitcnt;
		lbp->lb_commitcnt = 0;
		setidentifier(s, &lbp->lb_ident);
		crcgen(lbp, &lbp->lb_checksum, dbtob(lbp->lb_blkcnt), NULL);
		/*
		 * Don't need to write out device id area, since locator
		 * block on this replica is being deleted by setting the
		 * commitcnt to 0.
		 */
		(void) writeblks(s, (caddr_t)lbp, 0, lbp->lb_blkcnt, li,
		    MDDB_WR_ONLY_MASTER);
		lbp->lb_commitcnt = commitcnt;
	}

	if (s->s_mbiarray[li]) {
		/* A freed mbi pointer still exists in the mddb_ri_t */
		for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) {
			if (rip->ri_mbip == s->s_mbiarray[li])
				rip->ri_mbip = NULL;
		}
		free_mbipp(&s->s_mbiarray[li]);
	}

	if (! (locators[li].l_flags & MDDB_F_EMASTER)) {
		dev = md_expldev(locators[li].l_dev);
		dev = md_xlate_targ_2_mini(dev);
		if (dev != NODEV64)
			mddb_devclose(dev);
	}

	s->s_mbiarray[li] = 0;
	lbp->lb_locators[li].l_flags = MDDB_F_DELETED;

	/* Only support data tags for traditional and local sets */
	if ((md_get_setstatus(setno) & MD_SET_STALE) &&
	    (!(lbp->lb_flags & MDDB_MNSET)) &&
	    setno != MD_LOCAL_SET)
		if (set_dtag(s, ep))
			mdclrerror(ep);

	/* Write data tags to all accessible devices */
	/* Only support data tags for traditional and local sets */
	if (!(lbp->lb_flags & MDDB_MNSET)) {
		(void) dt_write(s);
	}

	/* Delete device id of deleted replica */
	if (lbp->lb_flags & MDDB_DEVID_STYLE) {
		(void) mddb_devid_delete(s, li);
	}
	/* write new locator to all devices */
	err = writelocall(s);

	(void) upd_med(s, "getdeldev(0)");

	SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_DELETE, SVM_TAG_REPLICA, setno,
	    md_expldev(locators[li].l_dev));

	computefreeblks(s); /* recompute always it may be larger */
	cp->c_dbcnt--;
	err |= fixoptrecords(s);
	if (err) {
		if (writeretry(s)) {
			single_thread_end(s);
			mddb_setexit(s);
			/*
			 * MDDB_E_NOTNOW is an mddb status code, so it is
			 * converted with mddbstatus2error() like the
			 * mddb_setenter() status above; mdmddberror() is
			 * only given MDE_DB_* codes in this routine.
			 */
			return (mddbstatus2error(ep, MDDB_E_NOTNOW, NODEV32,
			    setno));
		}
	}

	single_thread_end(s);
	mddb_setexit(s);
	return (0);
}

/*
 * getdriver - fill in clp->l_driver with the name of the driver that
 * owns the device clp->l_dev.
 *
 * During an upgrade (MD_UPGRADE) the name is taken from
 * md_targ_major_to_name(), otherwise from ddi_major_to_name().
 *
 * RETURN
 *	0	clp->l_driver has been filled in
 *	EINVAL	clp->l_dev is not a positive value, or no driver name
 *		is known for its major number
 */
static int
getdriver(
	mddb_cfg_loc_t	*clp
)
{
	major_t		majordev;
	const char	*name;

	/*
	 * Data checking
	 */
	if (clp->l_dev <= 0)
		return (EINVAL);

	majordev = getmajor(expldev(clp->l_dev));

	/* the major number must be known to the running system */
	name = ddi_major_to_name(majordev);
	if (name == (char *)NULL)
		return (EINVAL);

	if (MD_UPGRADE) {
		/*
		 * The name that gets copied comes from a different lookup
		 * in this case, so it has to be checked as well before it
		 * is handed to strcpy.
		 */
		name = md_targ_major_to_name(majordev);
		if (name == (char *)NULL)
			return (EINVAL);
	}

	(void) strcpy(clp->l_driver, name);
	return (0);
}

/*
 * update_valid_replica - updates the locator block namespace (prefix
 *	and/or suffix) with new pathname and devname.
 *
 *	Nothing is changed (and 0 is returned) when the locator's device
 *	is not devt.  A single trailing '/' is stripped from pathname in
 *	place before it is used as the prefix.
 *	RETURN
 *		1	Error (name too long, or no free prefix slot)
 *		0	Success
 */
static int
update_valid_replica(
	side_t		side,
	mddb_locator_t	*lp,
	mddb_set_t	*s,
	int		li,
	char		*devname,
	char		*pathname,
	md_dev64_t	devt
)
{
	size_t		path_len, dev_len;
	uchar_t		pre_len, suf_len;
	md_name_suffix	*sn;
	mddb_ln_t	*lnp;
	uchar_t		pre_index;
	uchar_t		i;

	if (md_expldev(lp->l_dev) != devt) {
		return (0);
	}

	/*
	 * Strip one trailing '/'.  An empty pathname has no last character
	 * to look at, so it is left alone.
	 */
	path_len = strlen(pathname);
	if (path_len > 0 && pathname[path_len - 1] == '/')
		pathname[--path_len] = '\0';
	dev_len = strlen(devname);

	/*
	 * Range check the full lengths before narrowing them to the
	 * one byte length fields; a narrowed value could wrap and pass.
	 */
	if ((path_len > MD_MAXPREFIX) || (dev_len > MD_MAXSUFFIX))
		return (1);

	pre_len = (uchar_t)path_len;
	suf_len = (uchar_t)dev_len;

	lnp = s->s_lnp;

	/*
	 * Future note:  Need to do something here for the MN diskset case
	 * when device ids are supported in disksets.
	 * Can't add until merging devids_in_diskset code into code base
	 * Currently only called with side of 0.
	 */

	sn = &lnp->ln_suffixes[side][li];

	/*
	 * Check if prefix (Ex: /dev/dsk) needs to be changed.
	 * If new prefix is the same as the previous prefix - no change.
	 *
	 * If new prefix is not the same, check if new prefix
	 * matches an existing one.  If so, use that one.
	 *
	 * If new prefix doesn't exist, add a new prefix.  If not enough
	 * space, return failure.
	 */
	pre_index = sn->suf_prefix;
	/* Check if new prefix is the same as the old prefix. */
	if ((lnp->ln_prefixes[pre_index].pre_len != pre_len) ||
	    (bcmp(lnp->ln_prefixes[pre_index].pre_data, pathname,
	    pre_len) != 0)) {
		/* Check if new prefix is an already known prefix. */
		for (i = 0; i < MDDB_PREFIXCNT; i++) {
			if (lnp->ln_prefixes[i].pre_len != pre_len) {
				continue;
			}
			if (bcmp(lnp->ln_prefixes[i].pre_data, pathname,
			    pre_len) == 0) {
				break;
			}
		}
		/* If no match found for new prefix - add the new prefix */
		if (i == MDDB_PREFIXCNT) {
			/* A slot with a zero length is treated as unused */
			for (i = 0; i < MDDB_PREFIXCNT; i++) {
				if (lnp->ln_prefixes[i].pre_len == 0)
					break;
			}
			/* No space to add new prefix - return failure */
			if (i == MDDB_PREFIXCNT) {
				return (1);
			}
			bcopy(pathname, lnp->ln_prefixes[i].pre_data, pre_len);
			lnp->ln_prefixes[i].pre_len = pre_len;
		}
		sn->suf_prefix = i;
	}

	/* Now, update the suffix (Ex: c0t0d0s0) if needed */
	if ((sn->suf_len != suf_len) ||
	    (bcmp(sn->suf_data, devname, suf_len) != 0)) {
		bcopy(devname, sn->suf_data, suf_len);
		sn->suf_len = suf_len;
	}
	return (0);
}


/*
 * md_update_locator_namespace - rewrite the locator names of a set.
 *
 *	When the locator block is in devid style, every locator that is
 *	not deleted, is active, and whose device id both exists and is
 *	valid is handed to update_valid_replica() together with the new
 *	devname (dname), pathname (pname) and devt.  The locator names are
 *	then re-stamped, re-checksummed and written out with writeall().
 *	RETURN
 *		1	Error (set could not be entered, a name update
 *			failed, or the write failed)
 *		0	Success
 */
int
md_update_locator_namespace(
	set_t		setno,		/* which set to get name from */
	side_t		side,
	char		*dname,
	char		*pname,
	md_dev64_t	devt
)
{
	mddb_set_t	*setp;
	mddb_lb_t	*lbp;
	mddb_ln_t	*lnp;
	mddb_locator_t	*locp;
	uint_t		didflags;
	int		idx;
	int		rval = 0;

	setp = mddb_setenter(setno, MDDB_MUSTEXIST, &rval);
	if (setp == NULL)
		return (1);

	single_thread_start(setp);
	lbp = setp->s_lbp;

	/* must be DEVID_STYLE */
	if (lbp->lb_flags & MDDB_DEVID_STYLE) {
		for (idx = 0; idx < lbp->lb_loccnt; idx++) {
			locp = &lbp->lb_locators[idx];

			/* only live, active replicas are candidates */
			if ((locp->l_flags & MDDB_F_DELETED) ||
			    !(locp->l_flags & MDDB_F_ACTIVE))
				continue;

			/* the device id has to exist and be valid */
			didflags = setp->s_did_icp->did_ic_blkp->
			    blk_info[idx].info_flags;
			if (!(didflags & MDDB_DID_EXISTS) ||
			    !(didflags & MDDB_DID_VALID))
				continue;

			if (update_valid_replica(side, locp, setp, idx,
			    dname, pname, devt)) {
				/* give up without writing anything */
				single_thread_end(setp);
				mddb_setexit(setp);
				return (1);
			}
		}
	}

	lnp = setp->s_lnp;
	uniqtime32(&lnp->ln_timestamp);
	lnp->ln_revision = (lbp->lb_flags & MDDB_MNSET) ?
	    MDDB_REV_MNLN : MDDB_REV_LN;
	crcgen(lnp, &lnp->ln_checksum, dbtob(lbp->lb_lnblkcnt), NULL);
	rval = writeall(setp, (caddr_t)lnp, lbp->lb_lnfirstblk,
	    lbp->lb_lnblkcnt, 0);

	/*
	 * For a MN diskset on the master node, flag that the locator
	 * names have changed by setting PARSE_LOCNM in the mddb_set.
	 */
	if ((lbp->lb_flags & MDDB_MNSET) &&
	    (md_set[setp->s_setno].s_am_i_master)) {
		setp->s_mn_parseflags |= MDDB_PARSE_LOCNM;
	}

	single_thread_end(setp);
	mddb_setexit(setp);
	return (rval ? 1 : 0);
}

/*
 * update_locatorblock - walk the non-deleted locators looking for one
 *		whose devt equals dev and which has a device id recorded
 *		that is not marked valid.  If that recorded device id
 *		differs from didptr, it is deleted and didptr is added in
 *		its place (with the minor name of dev), and the walk stops.
 *
 *		During import of replicated disksets, old_didptr contains
 *		the original disk's device id.  In that case the recorded
 *		device id must also equal old_didptr before the entry is
 *		considered a match.  This handles:
 *
 *		Original_disk	Replicated_disk	Disk_Available_During_Import
 *		c1t1d0		c1t3d0		no - so old name c1t1d0 shown
 *		c1t2d0		c1t1d0		yes - name is c1t1d0
 *		c1t3d0		c1t2d0		yes - name is c1t2d0
 *
 *		The devt of the first and third disks is the same, so devt
 *		alone cannot tell them apart; the original device id can.
 *
 *		Unless an error ends the walk early, the locator block is
 *		pushed with push_lb() and upd_med() is called.
 *	RETURN
 *		MDDB_E_NODEVID	matching entry has no in-core device id
 *		1		minor name of dev could not be obtained
 *		otherwise	the value returned by push_lb() (0 on success)
 */
static int
update_locatorblock(
	mddb_set_t	*s,
	md_dev64_t	dev,
	ddi_devid_t	didptr,
	ddi_devid_t	old_didptr
)
{
	mddb_lb_t	*lbp;
	mddb_locator_t	*locp;
	ddi_devid_t	cur_devid;
	char		*mname;
	uint_t		didflags;
	int		is_repl_import;
	int		idx;
	int		rval;

	/* Non-zero when this is a replicated import */
	is_repl_import = md_get_setstatus(s->s_setno) &
	    MD_SET_REPLICATED_IMPORT;

	lbp = s->s_lbp;
	for (idx = 0; idx < lbp->lb_loccnt; idx++) {
		locp = &lbp->lb_locators[idx];

		/* deleted replicas are of no interest */
		if (locp->l_flags & MDDB_F_DELETED)
			continue;

		/* need a devt match and a recorded device id */
		didflags = s->s_did_icp->did_ic_blkp->blk_info[idx].info_flags;
		if ((md_expldev(locp->l_dev) != dev) ||
		    !(didflags & MDDB_DID_EXISTS))
			continue;

		/* entries already marked valid are left alone */
		if (didflags & MDDB_DID_VALID)
			continue;

		cur_devid = s->s_did_icp->did_ic_devid[idx];
		if (cur_devid == NULL)
			return (MDDB_E_NODEVID);

		/*
		 * On a replicated import only the entry that still
		 * carries the original disk's device id may be updated.
		 */
		if (is_repl_import &&
		    ddi_devid_compare(cur_devid, old_didptr) != 0)
			continue;

		/* nothing to do when the device id is already the new one */
		if (ddi_devid_compare(cur_devid, didptr) == 0)
			continue;

		/* device ids differ: replace the old one with the new one */
		if (ddi_lyr_get_minor_name(md_dev64_to_dev(dev), S_IFBLK,
		    &mname) != DDI_SUCCESS)
			return (1);

		(void) mddb_devid_delete(s, idx);
		(void) mddb_devid_add(s, idx, didptr, mname);
		kmem_free(mname, strlen(mname) + 1);
		break;
	}

	rval = push_lb(s);
	(void) upd_med(s, "update_locatorblock(0)");
	return (rval);
}

/*
 * update_mb_devid - store devidptr in the master block of the replica
 *		described by rip and write that block to rip's device.
 *
 *	When rip has an in-core master block it is updated in place.
 *	When it has none (a disk that only carries a dummy master block),
 *	a temporary MDDB_MAGIC_DU block is built, provided a device id was
 *	supplied; with no master block and no device id nothing is done.
 *	RETURN
 *		0 when there was nothing to do, otherwise the value
 *		returned by putblks()
 */
static int
update_mb_devid(
	mddb_set_t	*s,
	mddb_ri_t	*rip,
	ddi_devid_t	devidptr
)
{
	mddb_mb_t	*mbp;
	daddr_t		mb_blkno;
	md_dev64_t	mb_dev;
	uint_t		devid_sz;
	int		is_temp = 0;
	int		rval;

	if (rip->ri_mbip != (mddb_mb_ic_t *)NULL) {
		/* use the master block already held in core */
		mbp = &rip->ri_mbip->mbi_mddb_mb;
	} else if (devidptr == (ddi_devid_t)NULL) {
		/* no master block and no device id to record */
		return (0);
	} else {
		/* build a dummy master block just to carry the device id */
		mbp = kmem_zalloc(MDDB_BSIZE, KM_SLEEP);
		mbp->mb_magic = MDDB_MAGIC_DU;
		mbp->mb_revision = MDDB_REV_MB;
		is_temp = 1;
	}

	mb_blkno = rip->ri_blkno;
	mb_dev = rip->ri_dev;

	if (devidptr != (ddi_devid_t)NULL) {
		/* wipe the previous device id, then install the new one */
		if (mbp->mb_devid_len)
			bzero(mbp->mb_devid, mbp->mb_devid_len);
		devid_sz = ddi_devid_sizeof(devidptr);
		bcopy((char *)devidptr, (char *)mbp->mb_devid, devid_sz);
		mbp->mb_devid_len = devid_sz;
	}

	mbp->mb_setno = s->s_setno;
	uniqtime32(&mbp->mb_timestamp);
	crcgen(mbp, &mbp->mb_checksum, MDDB_BSIZE, NULL);

	/*
	 * putblks drops the s_dbmx lock, does a biowait and then regains
	 * the lock.  Only a single block is written here, so this would
	 * need revisiting to handle mb_next != NULL (unlikely to happen).
	 */
	rval = putblks(s, (caddr_t)mbp, mb_blkno, 1, mb_dev, 0);

	if (is_temp)
		kmem_free(mbp, MDDB_BSIZE);

	return (rval);
}

/*
 * setdid - record the current device id of cp->c_devt in the set
 *		cp->c_setno.
 *
 *	The device id is fetched with ddi_lyr_get_devid().  It is written
 *	into the master block of every replica info entry that has a
 *	locator block and whose device is c_devt, and the locator block is
 *	then updated via update_locatorblock().  Nothing is done for a
 *	stale set.
 *	RETURN
 *		EINVAL	bad set number or device
 *		-1	device id unavailable, set could not be entered,
 *			or an update failed
 *		0	Success (including the stale set case)
 */
static int
setdid(
	mddb_config_t		*cp
)
{
	ddi_devid_t		devidp;
	dev_t			ddi_dev;
	mddb_set_t		*s;
	int			err = 0;
	mddb_ri_t		*rip;

	/*
	 * Data integrity check
	 */
	if (cp->c_setno >= md_nsets || cp->c_devt <= 0)
		return (EINVAL);

	if ((md_get_setstatus(cp->c_setno) & MD_SET_STALE))
		return (0);

	ddi_dev = md_dev64_to_dev(cp->c_devt);
	if (ddi_lyr_get_devid(ddi_dev, &devidp) != DDI_SUCCESS) {
		return (-1);
	}
	if (devidp == NULL) {
		return (-1);
	}

	if ((s = mddb_setenter(cp->c_setno, MDDB_MUSTEXIST, &err)) == NULL) {
		/* the device id obtained above must not be leaked */
		ddi_devid_free(devidp);
		return (-1);
	}
	single_thread_start(s);

	for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) {
		if (rip->ri_lbp == (mddb_lb_t *)NULL)
			continue;
		/*
		 * We only update what is asked
		 */
		if (rip->ri_dev == cp->c_devt) {
			if (update_mb_devid(s, rip, devidp) != 0) {
				err = -1;
				goto out;
			}
		}
	}

	if (update_locatorblock(s, cp->c_devt, devidp, NULL)) {
		err = -1;
		goto out;
	}

out:
	single_thread_end(s);
	mddb_setexit(s);
	ddi_devid_free(devidp);
	return (err);
}

/*
 * delnewside - add (MDDB_NEWSIDE) or remove (MDDB_DELSIDE) the locator
 *		information kept for side cp->c_sideno.
 *
 *	The replica described by cp->c_locator must already be present in
 *	the locator block of set cp->c_setno.  It is looked up by device id,
 *	minor name and block number when the locator block is in devid
 *	style and both could be obtained for the device, otherwise by dev
 *	and block number.
 *
 *	MDDB_NEWSIDE stores the locator name (cp->c_devname) and then the
 *	sidelocator information for that replica.  MDDB_DELSIDE clears the
 *	entries of side c_sideno for every locator slot.  In both cases the
 *	locator names and then the locator block are written to all
 *	devices.
 *
 *	Not permitted while MD_UPGRADE is set.
 *	RETURN
 *		0 on success, otherwise the value produced by
 *		mdmddberror(), mdmderror() or mddbstatus2error() for the
 *		error recorded in ep
 */
static int
delnewside(
	mddb_config_t		*cp,
	int			command,
	md_error_t		*ep
)
{
	mddb_set_t		*s;
	int			li;
	mddb_lb_t		*lbp;		/* pointer to locator block */
	mddb_ln_t		*lnp;		/* pointer to locator names */
	mddb_mnln_t		*mnlnp;		/* pointer to locator names */
	mddb_locator_t		*lp;
	mddb_sidelocator_t	*slp;
	mddb_cfg_loc_t		*clp;
	int			err = 0;
	set_t			setno = cp->c_setno;
	ddi_devid_t		devid;
	ddi_devid_t		ret_devid = NULL;
	char			*minor_name;
	uint_t			use_devid = 0;
	dev_t			ddi_dev;
	md_mnname_suffix_t	*mnsn;
	mddb_mnlb_t		*mnlbp;
	mddb_mnsidelocator_t	*mnslp;

	/* Currently don't allow addition/deletion of sides during upgrade */
	if (MD_UPGRADE) {
		cmn_err(CE_WARN,
		    "Addition and deletion of sides not allowed"
		    " during upgrade. \n");
		return (mdmddberror(ep, MDE_DB_NOTNOW, NODEV32, setno));
	}

	/*
	 * Data integrity check
	 */
	if (setno >= md_nsets || cp->c_locator.l_dev <= 0)
		return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));

	if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL)
		return (mddbstatus2error(ep, err, NODEV32, setno));

	single_thread_start(s);
	clp = &cp->c_locator;

	lbp = s->s_lbp;

	/* The locator block must belong to the requested set */
	if (lbp->lb_setno != setno) {
		single_thread_end(s);
		mddb_setexit(s);
		return (mdmddberror(ep, MDE_DB_INVALID, NODEV32, setno));
	}

	/*
	 * Find this device/blkno pair
	 */
	if (lbp->lb_flags & MDDB_DEVID_STYLE) {
		/*
		 * Match by device id only if both the device id and a
		 * minor name short enough for l_minor_name are available.
		 * When use_devid is set, clp->l_devid carries ret_devid and
		 * is released with ddi_devid_free() on every exit below.
		 */
		ddi_dev = md_dev64_to_dev(clp->l_dev);
		if ((ddi_lyr_get_devid(ddi_dev, &ret_devid) == DDI_SUCCESS) &&
		    (ddi_lyr_get_minor_name(ddi_dev, S_IFBLK, &minor_name)
		    == DDI_SUCCESS)) {
			if (strlen(minor_name) < MDDB_MINOR_NAME_MAX) {
				clp->l_devid = (uint64_t)(uintptr_t)ret_devid;
				use_devid = 1;
				(void) strcpy(clp->l_minor_name, minor_name);
			}
			kmem_free(minor_name, strlen(minor_name)+1);
		}
		/* Device id obtained but not going to be used: release it */
		if (use_devid != 1 && ret_devid != NULL)
			ddi_devid_free(ret_devid);
	}
	/* li ends up as the index of the match, or lb_loccnt if none */
	for (li = 0; li < lbp->lb_loccnt; li++) {
		lp = &lbp->lb_locators[li];
		if (lp->l_flags & MDDB_F_DELETED)
			continue;
		if (use_devid) {
			/* A zero return leaves this locator out of the search */
			if ((mddb_devid_get(s, li, &devid, &minor_name)) == 0)
				continue;
			if ((ddi_devid_compare(devid,
			    (ddi_devid_t)(uintptr_t)clp->l_devid) == 0) &&
			    (strcmp(clp->l_minor_name, minor_name) == 0) &&
			    ((daddr_t)lp->l_blkno == clp->l_blkno)) {
				break;
			}
		} else {
			if (lp->l_dev == clp->l_dev &&
			    (daddr_t)lp->l_blkno == clp->l_blkno) {
				break;
			}
		}
	}

	/* No locator matches the given replica */
	if (li == lbp->lb_loccnt) {
		if (use_devid)
			ddi_devid_free((ddi_devid_t)(uintptr_t)clp->l_devid);
		single_thread_end(s);
		mddb_setexit(s);
		return (mdmddberror(ep, MDE_DB_INVALID, NODEV32, setno));
	}

	lnp = s->s_lnp;
	if (command == MDDB_NEWSIDE) {
		int 	index = 0;
		/*
		 * If a MN diskset, need to find the index where the new
		 * locator information is to be stored in the mnsidelocator
		 * field of the locator block so that the locator name can
		 * be stored at the same array index in the mnsuffixes
		 * field of the locator names structure.
		 */
		if (lbp->lb_flags & MDDB_MNSET) {
			if ((index = checklocator(lbp, li,
			    cp->c_sideno)) == -1) {
				if (use_devid) {
					ddi_devid_free((ddi_devid_t)
					    (uintptr_t)clp->l_devid);
				}
				single_thread_end(s);
				mddb_setexit(s);
				return (mdmddberror(ep, MDE_DB_TOOSMALL,
				    NODEV32, setno));
			}
		}

		/*
		 * Store the locator name before the sidelocator information
		 * in case a panic occurs between these 2 steps.  Must have
		 * the locator name information in order to print reasonable
		 * error information.
		 */
		if (splitname2locatorblock(&cp->c_devname, lnp, li,
		    cp->c_sideno, index)) {
			if (use_devid)
				ddi_devid_free(
				    (ddi_devid_t)(uintptr_t)clp->l_devid);
			single_thread_end(s);
			mddb_setexit(s);
			return (mdmddberror(ep, MDE_DB_TOOSMALL, NODEV32,
			    setno));
		}

		if (cfgloc2locator(lbp, clp, li, cp->c_sideno, index)) {
			if (use_devid)
				ddi_devid_free(
				    (ddi_devid_t)(uintptr_t)clp->l_devid);
			single_thread_end(s);
			mddb_setexit(s);
			return (mdmddberror(ep, MDE_DB_TOOSMALL, NODEV32,
			    setno));
		}
	}

	/* Done with the device id obtained for the lookup */
	if (use_devid)
		ddi_devid_free((ddi_devid_t)(uintptr_t)clp->l_devid);

	if (command == MDDB_DELSIDE) {
		int i;
		/* Clear this side's entries for every locator slot */
		for (i = 0; i < lbp->lb_loccnt; i++) {
			if (lbp->lb_flags & MDDB_MNSET) {
				int	j;
				mnlbp = (mddb_mnlb_t *)lbp;
				/* Find the row that holds this side, if any */
				for (j = 0; j < MD_MNMAXSIDES; j++) {
					mnslp = &mnlbp->lb_mnsidelocators[j][i];
					if (mnslp->mnl_sideno == cp->c_sideno)
						break;
				}
				if (j < MD_MNMAXSIDES) {
					mnslp->mnl_mnum = NODEV32;
					mnslp->mnl_sideno = 0;
					mnlnp = (mddb_mnln_t *)lnp;
					mnsn = &(mnlnp->ln_mnsuffixes[j][i]);
					bzero((caddr_t)mnsn,
					    sizeof (md_mnname_suffix_t));
				}
			} else {
				/* Non-MN sets are indexed directly by side */
				slp = &lbp->lb_sidelocators[cp->c_sideno][i];
				bzero((caddr_t)&lnp->ln_suffixes
				    [cp->c_sideno][i], sizeof (md_name_suffix));
				slp->l_mnum = NODEV32;
			}
		}
	}

	/* write new locator names to all devices */
	uniqtime32(&lnp->ln_timestamp);
	if (lbp->lb_flags & MDDB_MNSET)
		lnp->ln_revision = MDDB_REV_MNLN;
	else
		lnp->ln_revision = MDDB_REV_LN;
	crcgen(lnp, &lnp->ln_checksum, dbtob(lbp->lb_lnblkcnt), NULL);
	err |= writeall(s, (caddr_t)lnp, lbp->lb_lnfirstblk,
	    lbp->lb_lnblkcnt, 0);
	/*
	 * If a MN diskset and this is the master, set the PARSE_LOCNM
	 * flag in the mddb_set structure to show that the locator
	 * names have changed.
	 */

	if ((lbp->lb_flags & MDDB_MNSET) &&
	    (md_set[s->s_setno].s_am_i_master)) {
		s->s_mn_parseflags |= MDDB_PARSE_LOCNM;
	}
	/* A failed write is only fatal if writeretry() fails as well */
	if (err) {
		if (writeretry(s)) {
			single_thread_end(s);
			mddb_setexit(s);
			return (mdmddberror(ep, MDE_DB_NOTNOW, NODEV32, setno));
		}
	}

	uniqtime32(&lbp->lb_timestamp);
	/* write new locator to all devices */
	err = writelocall(s);

	(void) upd_med(s, "delnewside(0)");

	computefreeblks(s); /* recompute always it may be larger */
	if (err) {
		if (writeretry(s)) {
			single_thread_end(s);
			mddb_setexit(s);
			return (mdmddberror(ep, MDE_DB_NOTNOW, NODEV32, setno));
		}
	}

	single_thread_end(s);
	mddb_setexit(s);

	return (0);
}

static int
newdev(
	mddb_config_t	*cp,
	int		command,
	md_error_t	*ep
)
{
	mddb_set_t	*s;
	mddb_mb_ic_t	*mbip, *mbip1;
	int		i, j;
	int		li;
	mddb_lb_t	*lbp;		/* pointer to locator block */
	mddb_ln_t	*lnp;		/* pointer to locator names */
	mddb_locator_t	*lp;
	mddb_cfg_loc_t	*clp;
	int		err = 0;
	set_t		setno = cp->c_setno;
	ddi_devid_t	devid2;
	ddi_devid_t	ret_devid = NULL;
	char		*minor_name;
	uint_t		use_devid = 0;
	dev_t		ddi_dev;
	int		old_flags;
	int		flags;
	int		mn_set = 0;
	int		index;
	mddb_ri_t	*rip;
	int		locator_deleted = 0;
	dev32_t		locator_deleted_dev;
	int		sz = 0;


	/* Currently don't allow addition of new replica during upgrade */
	if (MD_UPGRADE) {
		cmn_err(CE_WARN,
		    "Addition of new replica not allowed during upgrade.\n");
		return (mdmddberror(ep, MDE_DB_NOTNOW, NODEV32, setno));
	}

	/*
	 * Data integrity check
	 */
	if (setno >= md_nsets || cp->c_locator.l_dev <= 0)
		return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));

	/* Determine the flag settings for multinode sets */
	flags = MDDB_NOOLDOK;
	if (cp->c_multi_node)
		flags |= MDDB_MULTINODE;

	if ((s = mddb_setenter(setno, flags, &err)) == NULL) {
		if (err != MDDB_E_NOTOWNER)
			return (mddbstatus2error(ep, err, NODEV32, setno));
		s = init_set(cp, flags, &err);
		if (s == NULL)
			return (mddbstatus2error(ep, err, NODEV32, setno));
	}

	single_thread_start(s);

	/* shorthand */
	clp = &cp->c_locator;

	/* shorthand */
	lbp = s->s_lbp;

	if (lbp->lb_setno != setno) {
		single_thread_end(s);
		mddb_setexit(s);
		return (mdmddberror(ep, MDE_DB_INVALID, NODEV32, setno));
	}

	/*
	 * See if this device/blkno pair is already a replica
	 */
	if (lbp->lb_flags & MDDB_DEVID_STYLE) {
		ddi_dev = expldev(clp->l_dev);
		if ((ddi_lyr_get_devid(ddi_dev, &ret_devid) == DDI_SUCCESS) &&
		    (ddi_lyr_get_minor_name(ddi_dev,
		    S_IFBLK, &minor_name) == DDI_SUCCESS)) {
			if (strlen(minor_name) < MDDB_MINOR_NAME_MAX) {
				clp->l_devid = (uint64_t)(uintptr_t)ret_devid;
				use_devid = 1;
				(void) strcpy(clp->l_minor_name, minor_name);
			}
			kmem_free(minor_name, strlen(minor_name)+1);
		}
		if (use_devid != 1 && ret_devid != NULL)
			ddi_devid_free(ret_devid);
	}

	for (i = 0; i < lbp->lb_loccnt;	 i++) {
		lp = &lbp->lb_locators[i];
		if (lp->l_flags & MDDB_F_DELETED)
			continue;
		if (use_devid) {
			if ((mddb_devid_get(s, i, &devid2, &minor_name)) == 0)
				continue;
			if ((ddi_devid_compare(devid2,
			    (ddi_devid_t)(uintptr_t)clp->l_devid) == 0) &&
			    (strcmp(clp->l_minor_name, minor_name) == 0) &&
			    ((daddr_t)lp->l_blkno == clp->l_blkno)) {
				if (command == MDDB_NEWDEV) {
					ddi_devid_free((ddi_devid_t)(uintptr_t)
					    clp->l_devid);
					single_thread_end(s);
					mddb_setexit(s);
					return (mdmddberror(ep,
					    MDE_DB_EXISTS, NODEV32, setno));
				}
			}
		} else {
			if (lp->l_dev == clp->l_dev &&
			    (daddr_t)lp->l_blkno == clp->l_blkno) {
				if (command == MDDB_NEWDEV) {
					single_thread_end(s);
					mddb_setexit(s);
					return (mdmddberror(ep,
					    MDE_DB_EXISTS, NODEV32, setno));
				}
			}
		}
	}

	/*
	 * Really is a new replica, go get the master blocks
	 */
	mbip = getmasters(s, md_expldev(clp->l_dev), clp->l_blkno,
	    (uint_t *)0, &mn_set);
	if (! mbip) {
		if (use_devid)
			ddi_devid_free((ddi_devid_t)(uintptr_t)clp->l_devid);
		single_thread_end(s);
		mddb_setexit(s);
		return (mdmddberror(ep, MDE_DB_MASTER, NODEV32, setno));
	}

	/*
	 * Compute free blocks in replica.
	 */
	computefreeblks(s);

	/*
	 * Check if this is large enough
	 */
	for (mbip1 = mbip, i = 0; mbip1 != NULL; mbip1 = mbip1->mbi_next)
		i += mbip1->mbi_mddb_mb.mb_blkcnt;
	for (j = i; j < s->s_totalblkcnt; j++) {
		if (blkcheck(s, j)) {
			while (mbip) {
				mbip1 = mbip->mbi_next;
				kmem_free((caddr_t)mbip, MDDB_IC_BSIZE);
				mbip = mbip1;
			}
			if (use_devid)
				ddi_devid_free(
				    (ddi_devid_t)(uintptr_t)clp->l_devid);
			mddb_devclose(md_expldev(clp->l_dev));
			single_thread_end(s);
			mddb_setexit(s);
			return (mdmddberror(ep, MDE_DB_TOOSMALL, NODEV32,
			    setno));
		}
	}

	/* Look for a deleted slot */
	for (li = 0; li < lbp->lb_loccnt; li++) {
		lp = &lbp->lb_locators[li];
		if (lp->l_flags & MDDB_F_DELETED) {
			locator_deleted = 1;
			locator_deleted_dev = lp->l_dev;
			break;
		}
	}

	/* If no deleted slots, add a new one */
	if (li == lbp->lb_loccnt) {
		/* Already have the max replicas, bail */
		if (lbp->lb_loccnt == MDDB_NLB) {
			if (use_devid)
				ddi_devid_free((ddi_devid_t)(uintptr_t)
				    clp->l_devid);
			mddb_devclose(md_expldev(clp->l_dev));
			single_thread_end(s);
			mddb_setexit(s);
			return (mdmddberror(ep, MDE_TOOMANY_REPLICAS, NODEV32,
			    setno));
		}
		lbp->lb_loccnt++;
		lp = &lbp->lb_locators[li];
	}

	/* Initialize the new or deleted slot */
	old_flags = lp->l_flags;
	lp->l_dev = clp->l_dev;
	lp->l_blkno = (daddr32_t)clp->l_blkno;
	lp->l_flags = clp->l_flags;

	/* shorthand */
	lnp = s->s_lnp;

	index = 0;
	if ((lbp->lb_flags & MDDB_MNSET) || (flags & MDDB_MULTINODE)) {
		/*
		 * If a MN diskset, need to find the index where the new
		 * locator information is to be stored in the mnsidelocator
		 * field of the locator block so that the locator name can
		 * be stored at the same array index in the mnsuffixes
		 * field of the locator names structure.
		 */
		lbp->lb_flags |= MDDB_MNSET;
		if ((index = checklocator(lbp, li, s->s_sideno)) == -1) {
			if (use_devid)
				ddi_devid_free((ddi_devid_t)(uintptr_t)clp->
				    l_devid);
			lp->l_flags = old_flags;
			lbp->lb_loccnt--;
			mddb_devclose(md_expldev(clp->l_dev));
			single_thread_end(s);
			mddb_setexit(s);
			return (mdmddberror(ep, MDE_DB_TOOSMALL,
			    NODEV32, setno));
		}
	}
	/*
	 * Store the locator name before the sidelocator information
	 * in case a panic occurs between these 2 steps.  Must have
	 * the locator name information in order to print reasonable
	 * error information.
	 */
	if (splitname2locatorblock(&cp->c_devname, lnp, li,
	    s->s_sideno, index)) {
		if (use_devid)
			ddi_devid_free((ddi_devid_t)(uintptr_t)clp->l_devid);
		lp->l_flags = old_flags;
		lbp->lb_loccnt--;
		mddb_devclose(md_expldev(clp->l_dev));
		single_thread_end(s);
		mddb_setexit(s);
		return (mdmddberror(ep, MDE_DB_TOOSMALL, NODEV32, setno));
	}

	/*
	 * Compute free blocks in replica before calling cfgloc2locator
	 * since cfgloc2locator may attempt to alloc an unused block
	 * to store the device id.
	 * mbiarray needs to be setup before calling computefreeblks.
	 */
	s->s_mbiarray[li] = mbip;
	computefreeblks(s);

	if (cfgloc2locator(lbp, clp, li, s->s_sideno, index)) {
		if (use_devid)
			ddi_devid_free((ddi_devid_t)(uintptr_t)clp->l_devid);
		lp->l_flags = old_flags;
		lbp->lb_loccnt--;
		s->s_mbiarray[li] = 0;
		mddb_devclose(md_expldev(clp->l_dev));
		single_thread_end(s);
		mddb_setexit(s);
		return (mdmddberror(ep, MDE_DB_TOOSMALL, NODEV32, setno));
	}

	/*
	 * Hijack a deleted rip master record and correct the contents
	 */
	if (locator_deleted) {
		for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) {
			if (rip->ri_lbp != NULL &&
			    rip->ri_mbip == 0 &&
			    (rip->ri_dev == md_expldev(locator_deleted_dev))) {
				rip->ri_dev = md_expldev(clp->l_dev);
				rip->ri_mbip = mbip;

				if (use_devid && clp->l_devid != 0) {
					sz = (int)ddi_devid_sizeof(
					    (ddi_devid_t)(uintptr_t)
					    clp->l_devid);
					rip->ri_devid =
					    (ddi_devid_t)kmem_zalloc(sz,
					    KM_SLEEP);
					bcopy((void *)(uintptr_t)clp->l_devid,
					    (char *)rip->ri_devid, sz);
				}

				break;
			}
		}
	}

	if (use_devid)
		ddi_devid_free((ddi_devid_t)(uintptr_t)clp->l_devid);

	uniqtime32(&lbp->lb_timestamp);
	lp->l_flags = MDDB_F_ACTIVE;

	/* write db copy to new device */
	err = writecopy(s, li, MDDB_WRITECOPY_ALL);
	lp->l_flags |= MDDB_F_UP2DATE;

	/* write new locator names to all devices */
	uniqtime32(&lnp->ln_timestamp);
	if (lbp->lb_flags & MDDB_MNSET)
		lnp->ln_revision = MDDB_REV_MNLN;
	else
		lnp->ln_revision = MDDB_REV_LN;
	crcgen(lnp, &lnp->ln_checksum, dbtob(lbp->lb_lnblkcnt), NULL);
	err |= writeall(s, (caddr_t)lnp, lbp->lb_lnfirstblk,
	    lbp->lb_lnblkcnt, 0);
	/*
	 * If a MN diskset and this is the master, set the PARSE_LOCNM
	 * flag in the mddb_set structure to show that the locator
	 * names have changed.
	 */

	if ((lbp->lb_flags & MDDB_MNSET) &&
	    (md_set[s->s_setno].s_am_i_master)) {
		s->s_mn_parseflags |= MDDB_PARSE_LOCNM;
	}
	if (err) {
		if (writeretry(s)) {
			single_thread_end(s);
			mddb_setexit(s);
			return (mdmddberror(ep, MDE_DB_NOTNOW, NODEV32, setno));
		}
	}

	/* Data tags not supported on MN sets */
	if ((md_get_setstatus(setno) & MD_SET_STALE) &&
	    (!(lbp->lb_flags & MDDB_MNSET)) &&
	    setno != MD_LOCAL_SET)
		if (set_dtag(s, ep))
			mdclrerror(ep);

	/* Write data tags to all accessible devices */
	/* Data tags not supported on MN sets */
	if (!(lbp->lb_flags & MDDB_MNSET)) {
		(void) dt_write(s);
	}

	/* write new locator to all devices */
	err = writelocall(s);

	(void) upd_med(s, "newdev(0)");

	SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_CREATE, SVM_TAG_REPLICA, setno,
	    md_expldev(clp->l_dev));

	computefreeblks(s); /* recompute always it may be smaller */
	if (err) {
		if (writeretry(s)) {
			single_thread_end(s);
			mddb_setexit(s);
			return (mdmddberror(ep, MDE_DB_NOTNOW, NODEV32, setno));
		}
	}

	single_thread_end(s);
	mddb_setexit(s);

	return (0);
}

#ifdef DEBUG
/*
 * mddb_check_set - DEBUG-only consistency check of one set.
 *
 *	Walks every directory entry of the set's database and asserts
 *	that its record block carries MDDB_MAGIC_RB and that a non-zero
 *	user data pointer is greater than 2000.  Does nothing when the
 *	set has no database attached.
 */
static void
mddb_check_set(
	set_t	setno
)
{
	mddb_set_t	*setp;
	mddb_db_t	*dirp;
	mddb_de_ic_t	*entp;
	mddb_rb32_t	*recp;

	setp = (mddb_set_t *)md_set[setno].s_db;
	if (! setp)
		return;

	dirp = setp->s_dbp;
	while (dirp != NULL) {
		entp = dirp->db_firstentry;
		while (entp != NULL) {
			recp = entp->de_rb;
			ASSERT(recp->rb_magic == MDDB_MAGIC_RB);
			if (entp->de_rb_userdata) {
				ASSERT((uintptr_t)entp->de_rb_userdata > 2000);
			}
			entp = entp->de_next;
		}
		dirp = dirp->db_next;
	}
}
#endif /* DEBUG */

/*
 * Exported Entry Points
 */
#ifdef DEBUG
/*
 * mddb_check - DEBUG-only consistency check of every set that has a
 *		database attached (see mddb_check_set()).
 */
void
mddb_check(void)
{
	int	i;

	for (i = 0; i < md_nsets; i++) {
		/*
		 * A set without a database is skipped, not treated as the
		 * end of the table, so that later sets are still checked.
		 */
		if (! md_set[i].s_db)
			continue;

		mddb_check_set(i);
	}

}
#endif /* DEBUG */

/*
 * mddb_configure - dispatch a database configuration command.
 *
 *	Clears the error in cp->c_mde and hands the request to the
 *	routine that implements the command.  MDDB_USEDEV and
 *	MDDB_RELEASESET are handled inline.
 *	RETURN
 *		the value returned by the routine that handled the
 *		command (0 for MDDB_RELEASESET)
 */
int
mddb_configure(
	mddb_cfgcmd_t	command,
	mddb_config_t	*cp
)
{
	md_error_t	*ep = &cp->c_mde;
	set_t		setno = cp->c_setno;
	mddb_set_t	*setp;
	int		ridev_flag = 0;
	int		rval = 0;

	mdclrerror(ep);

	switch (command) {
	case MDDB_NEWDEV:
		rval = newdev(cp, command, ep);
		break;

	case MDDB_NEWSIDE:
	case MDDB_DELSIDE:
		rval = delnewside(cp, command, ep);
		break;

	case MDDB_GETDEV:
	case MDDB_DELDEV:
	case MDDB_ENDDEV:
		rval = getdeldev(cp, command, ep);
		break;

	case MDDB_GETDRVRNAME:
		rval = getdriver(&cp->c_locator);
		break;

	case MDDB_USEDEV:
		/*
		 * Note: must allow USEDEV ioctl during upgrade to
		 * support auto-take disksets.
		 *
		 * A set import is refused while md_devid_destroy is set.
		 */
		if ((cp->c_flags & MDDB_C_IMPORT) && md_devid_destroy)
			return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));

		if (setno >= md_nsets)
			return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));

		/* enter the set, initializing it if it cannot be entered */
		setp = mddb_setenter(setno, MDDB_NOINIT, &rval);
		if (setp == NULL)
			setp = init_set(cp, MDDB_NOINIT, &rval);
		if (setp == NULL) {
			rval = mddbstatus2error(ep, rval, NODEV32, setno);
			break;
		}

		if (setno == MD_LOCAL_SET)
			ridev_flag = MDDB_F_IOCTL;

		/* an old device id in the locator marks a replicated import */
		if (cp->c_locator.l_old_devid) {
			md_set_setstatus(setno, MD_SET_REPLICATED_IMPORT);
		}

		rval = ridev(&setp->s_rip, &cp->c_locator, NULL, ridev_flag);
		mddb_setexit(setp);
		break;

	case MDDB_RELEASESET:
		mutex_enter(&mddb_lock);
		mddb_unload_set(cp->c_setno);
		mutex_exit(&mddb_lock);
		break;

	case MDDB_SETDID:
		rval = setdid(cp);
		break;

	default:
		rval = mdmddberror(ep, MDE_DB_INVALID, NODEV32, cp->c_setno);
	}

	return (rval);
}

/*
 * mddb_getoptloc - report the two optimized-record locator indexes of
 *		the record ol->recid.
 *
 *	ol->li[0] and ol->li[1] are preset to -1 and are only replaced
 *	when the set can be entered and an entry with a matching record
 *	id is found.
 *	RETURN
 *		EINVAL	set number encoded in recid is out of range
 *		0	otherwise (found or not)
 */
int
mddb_getoptloc(
	mddb_optloc_t		*ol
)
{
	mddb_set_t		*setp;
	mddb_db_t		*dirp;
	mddb_de_ic_t		*entp;
	mddb_recid_t		recid;
	set_t			setno;
	int			found = 0;

	ol->li[0] = -1;
	ol->li[1] = -1;

	recid = ol->recid;
	setno = DBSET(recid);
	if (setno >= md_nsets)
		return (EINVAL);

	setp = mddb_setenter(setno, MDDB_MUSTEXIST, NULL);
	if (setp == NULL)
		return (0);

	/* only the record part of the id is compared below */
	recid = DBID(recid);
	for (dirp = setp->s_dbp; dirp != NULL && !found;
	    dirp = dirp->db_next) {
		for (entp = dirp->db_firstentry; entp != NULL;
		    entp = entp->de_next) {
			if (entp->de_recid == recid) {
				ol->li[0] = entp->de_optinfo[0].o_li;
				ol->li[1] = entp->de_optinfo[1].o_li;
				found = 1;
				break;
			}
		}
	}

	mddb_setexit(setp);
	return (0);
}

/*
 * mddb_init - initialize mddb_lock and call init_set() with no
 *		configuration; a set returned by it is exited again
 *		right away.
 */
void
mddb_init(void)
{
	mddb_set_t	*setp;

	mutex_init(&mddb_lock, NULL, MUTEX_DEFAULT, NULL);

	setp = init_set(NULL, MDDB_NOINIT, NULL);
	if (setp == NULL)
		return;

	mddb_setexit(setp);
}


/*
 * mddb_unload - with mddb_lock held, clear MD_SET_KEEPTAG on and unload
 *		every set slot, then free the crc table.
 */
void
mddb_unload(void)
{
	int	setno = 0;

	mutex_enter(&mddb_lock);

	while (setno < md_nsets) {
		md_clr_setstatus(setno, MD_SET_KEEPTAG);
		mddb_unload_set(setno);
		setno++;
	}

	crcfreetab();

	mutex_exit(&mddb_lock);
}

/*
 * Create a new record in the database of set `setno'.
 *
 * usersize	size of the caller's record data
 * type		type1 stored in the directory entry
 * type2	type2 stored in the directory entry
 * options	MD_CRO_* flags; exactly which record class is created is
 *		selected by the switch on flag_type below, and one of
 *		MD_CRO_32BIT / MD_CRO_64BIT must be present
 * setno	set the record is created in
 *
 * On success the return value is MAKERECID(setno, newid).  Failure
 * returns visible in this function are MDDB_E_INVALID (no 32/64 bit
 * option, or record needs more blocks than allowed), MDDB_E_NOTNOW
 * (checkstate() failed, no free record id, or the write could not be
 * retried successfully), MDDB_E_NOSPACE (not enough free blocks), or
 * whatever mddb_setenter() stored in err.
 */
mddb_recid_t
mddb_createrec(
	size_t		usersize,	 /* size of db record */
	mddb_type_t	type,		 /* type1 of db record */
	uint_t		type2,		 /* type2 of db record */
	md_create_rec_option_t	options, /* options for this creation  */
	set_t		setno		 /* set number to create record in */
)
{
	mddb_set_t	*s;
	mddb_db_t	*dbp, *prevdbp, *newdbp;
	mddb_db32_t	*db32p;
	mddb_de_ic_t	*dep;
	/* LINTED variable unused - used for sizeof calculations */
	mddb_de32_t	*de32p;
	mddb_rb32_t	*rbp;
	size_t		recsize;
	ulong_t		blkcnt;
	ulong_t		maxblocks;
	size_t		desize, desize_ic;
	size_t		used;
	mddb_recid_t	newid;
	caddr_t		tmppnt;
	int		i, err = 0;
	void		*userdata;
	uint_t		flag_type;

#if defined(_ILP32) && !defined(lint)
	ASSERT(sizeof (mddb_de_t) == sizeof (mddb_de32_t));
	ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t));
	ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t));
#endif

	/*
	 * everyone is supposed to specify if it's a
	 * 32 bit or a 64 bit record
	 */
	if ((options &(MD_CRO_32BIT|MD_CRO_64BIT)) == 0) {
		return (MDDB_E_INVALID);
	}

	if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL)
		return (err);

	if (checkstate(s, MDDB_PROBE)) {
		mddb_setexit(s);
		return (MDDB_E_NOTNOW);
	}

	/*
	 * Record size is the record block header (everything in front of
	 * rb_data) plus the user data, rounded up to whole MDDB_BSIZE
	 * blocks.
	 */
	recsize = roundup((sizeof (*rbp) - sizeof (rbp->rb_data)) +
	    usersize, MDDB_BSIZE);
	blkcnt = btodb(recsize);

	/*
	 * Unless overridden by the mddb_maxblocks tunable, a record may use
	 * at most as many blocks as fit in the block list of a single
	 * directory entry occupying an otherwise empty directory block.
	 */
	if (mddb_maxblocks)
		maxblocks = mddb_maxblocks;
	else
		maxblocks = (MDDB_BSIZE - (sizeof (*db32p) + sizeof (*de32p) -
		    sizeof (de32p->de32_blks))) / sizeof (mddb_block_t);

	if (blkcnt > maxblocks) {
		mddb_setexit(s);
		return (MDDB_E_INVALID);
	}
	/*
	 * allocate the record block and a new directory block up front
	 * so as to avoid sleeping after starting single_thread.
	 * userdata is only allocated (and only later used or freed) for
	 * non-optimized records.
	 */
	rbp = (mddb_rb32_t *)kmem_zalloc(recsize, KM_SLEEP);
	if ((options & MD_CRO_OPTIMIZE) == 0)
		userdata = kmem_zalloc(usersize, KM_SLEEP);
	newdbp = (mddb_db_t *)kmem_zalloc(sizeof (*newdbp), KM_SLEEP);

	/*
	 * if this is the largest record, allocate a new buffer for
	 * checkcopy();
	 */
	if (recsize > s->s_databuffer_size) {
		tmppnt = (caddr_t)kmem_zalloc(recsize, KM_SLEEP);
		/*
		 * re-test in case we slept during the allocation and
		 * some other task already bumped the max record size
		 */
		if (recsize > s->s_databuffer_size) {
			if (s->s_databuffer_size)
				kmem_free(s->s_databuffer,
				    s->s_databuffer_size);
			s->s_databuffer = tmppnt;
			s->s_databuffer_size = recsize;
		} else {
			kmem_free(tmppnt, recsize);
		}
	}

	single_thread_start(s);

	/*
	 * Pick the lowest record id not used by any directory entry.  The
	 * do/while repeats while the inner search found newid in use (dbp
	 * is non-NULL only when the search broke out on a match).  If
	 * DBID(newid) becomes 0 the id space is exhausted: undo the
	 * allocations and fail.
	 */
	newid = 0;
	do {
		newid++;
		if (DBID(newid) == 0) {
			kmem_free((caddr_t)newdbp, sizeof (*newdbp));
			kmem_free((caddr_t)rbp, ((size_t)recsize));
			if ((options & MD_CRO_OPTIMIZE) == 0)
				kmem_free(userdata, usersize);
			single_thread_end(s);
			mddb_setexit(s);
			return (MDDB_E_NOTNOW);
		}

		for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
			for (dep = dbp->db_firstentry; dep;
			    dep = dep->de_next) {
				if (dep->de_recid == newid)
					break;
			}
			if (dep != NULL)
				break;
		}
	} while (dbp);

	/* size of the new directory entry in its 32 bit (de32) layout */
	desize = (sizeof (*de32p) - sizeof (de32p->de32_blks)) +
	    (sizeof (mddb_block_t) * blkcnt);

	/*
	 * see if a directory block exists which will hold this entry
	 */
	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
		used = sizeof (*db32p);
		for (dep = dbp->db_firstentry;
		    dep != NULL; dep = dep->de_next) {
			used += sizeof (*de32p) - sizeof (de32p->de32_blks);
			used += sizeof (mddb_block_t) * dep->de_blkcount;
		}
		if ((used + desize) < MDDB_BSIZE)
			break;
	}
	if (dbp) {
		/*
		 * an existing directory block has room; the preallocated
		 * one is not needed
		 */
		kmem_free((caddr_t)newdbp, sizeof (*newdbp));
		if (blkcnt > s->s_freeblkcnt) {
			kmem_free((caddr_t)rbp, ((size_t)recsize));
			if ((options & MD_CRO_OPTIMIZE) == 0)
				kmem_free(userdata, usersize);
			single_thread_end(s);
			mddb_setexit(s);
			return (MDDB_E_NOSPACE);
		}
		prevdbp = NULL;
	} else {
		/*
		 * need to add directory block, which costs one more
		 * free block than the record itself
		 */
		if ((blkcnt + 1) > s->s_freeblkcnt) {
			kmem_free((caddr_t)newdbp, sizeof (*newdbp));
			kmem_free((caddr_t)rbp, ((size_t)recsize));
			if ((options & MD_CRO_OPTIMIZE) == 0)
				kmem_free(userdata, usersize);
			single_thread_end(s);
			mddb_setexit(s);
			return (MDDB_E_NOSPACE);
		}
		/* append newdbp to the tail of the directory block list */
		for (dbp = s->s_dbp; dbp->db_next; dbp = dbp->db_next)
			;
		dbp->db_next = newdbp;
		bzero((caddr_t)dbp->db_next, sizeof (*newdbp));
		dbp->db_nextblk = getfreeblks(s, 1);
		dbp->db_next->db_blknum = dbp->db_nextblk;
		/*
		 * remember the old tail: its link changed, so it is
		 * rewritten below as well
		 */
		prevdbp = dbp;
		dbp = dbp->db_next;
		dbp->db_nextblk = 0;
		dbp->db_firstentry = NULL;
		dbp->db_recsum = 0;
		dbp->db_magic = MDDB_MAGIC_DB;
	}
	/*
	 * ready to add record: allocate the in-core directory entry and
	 * link it at the end of the chosen directory block's entry list
	 */
	desize_ic = (sizeof (*dep) - sizeof (dep->de_blks)) +
	    (sizeof (mddb_block_t) * blkcnt);
	if (dbp->db_firstentry) {
		for (dep = dbp->db_firstentry; dep->de_next; dep = dep->de_next)
			;
		dep->de_next = (mddb_de_ic_t *)kmem_zalloc(desize_ic, KM_SLEEP);
		dep = dep->de_next;
	} else {
		dep = (mddb_de_ic_t *)kmem_zalloc(desize_ic, KM_SLEEP);
		dbp->db_firstentry = dep;
	}
	bzero((caddr_t)dep, desize_ic);
	dep->de_recid = newid;
	/*
	 * Optimized records have an owner node associated with them in
	 * a MN diskset.  The owner is only set on a node that is actively
	 * writing to that record.  The other nodes will show that record
	 * as having an invalid owner.  The owner for an optimized record
	 * is used during fixoptrecord to determine which node should
	 * write out the record when the replicas associated with that
	 * optimized record have been changed.
	 */
	if (MD_MNSET_SETNO(s->s_setno)) {
		dep->de_owner_nodeid = MD_MN_INVALID_NID;
	}
	dep->de_type1 =	type;
	dep->de_type2 = type2;
	dep->de_reqsize = usersize;
	dep->de_recsize = recsize;
	dep->de_blkcount = blkcnt;
	/*
	 * Translate the record-class option into the directory entry flag.
	 * If none of the cases matches, de_flags keeps the zero value left
	 * by the bzero above.
	 */
	flag_type = options &
	    (MD_CRO_OPTIMIZE | MD_CRO_STRIPE | MD_CRO_MIRROR | MD_CRO_RAID |
	    MD_CRO_SOFTPART | MD_CRO_TRANS_MASTER | MD_CRO_TRANS_LOG |
	    MD_CRO_HOTSPARE | MD_CRO_HOTSPARE_POOL | MD_CRO_CHANGELOG);
	switch (flag_type) {
	case MD_CRO_OPTIMIZE:
		dep->de_flags = MDDB_F_OPT;
		getoptdev(s, dep, 0);
		getoptdev(s, dep, 1);
		break;
	case MD_CRO_STRIPE:
		dep->de_flags = MDDB_F_STRIPE;
		break;
	case MD_CRO_MIRROR:
		dep->de_flags = MDDB_F_MIRROR;
		break;
	case MD_CRO_RAID:
		dep->de_flags = MDDB_F_RAID;
		break;
	case MD_CRO_SOFTPART:
		dep->de_flags = MDDB_F_SOFTPART;
		break;
	case MD_CRO_TRANS_MASTER:
		dep->de_flags = MDDB_F_TRANS_MASTER;
		break;
	case MD_CRO_TRANS_LOG:
		dep->de_flags = MDDB_F_TRANS_LOG;
		break;
	case MD_CRO_HOTSPARE:
		dep->de_flags = MDDB_F_HOTSPARE;
		break;
	case MD_CRO_HOTSPARE_POOL:
		dep->de_flags = MDDB_F_HOTSPARE_POOL;
		break;
	case MD_CRO_CHANGELOG:
		dep->de_flags = MDDB_F_CHANGELOG;
		break;
	}
	/*
	 * try to get all blocks consecutive. If not possible
	 * just get them one at a time
	 */
	dep->de_blks[0] = getfreeblks(s, blkcnt);
	if (dep->de_blks[0]) {
		for (i = 1; i < blkcnt; i++)
			dep->de_blks[i] = dep->de_blks[0] + i;
	} else {
		for (i = 0; i < blkcnt;	 i++)
			dep->de_blks[i] = getfreeblks(s, 1);
	}
	dep->de_rb = rbp;
	bzero((caddr_t)rbp, recsize);
	rbp->rb_magic = MDDB_MAGIC_RB;

	/* Do we have to create an old style (32 bit) record?  */
	if (options & MD_CRO_32BIT) {
		if (options & MD_CRO_FN)
			rbp->rb_revision = MDDB_REV_RBFN;
		else
			rbp->rb_revision = MDDB_REV_RB;
	} else {
		if (options & MD_CRO_FN)
			rbp->rb_revision = MDDB_REV_RB64FN;
		else
			rbp->rb_revision = MDDB_REV_RB64;
	}

	/* set de_rb_userdata for non optimization records */
	if ((options & MD_CRO_OPTIMIZE) == 0) {
		dep->de_rb_userdata = userdata;
	}

	uniqtime32(&rbp->rb_timestamp);
	/* Generate the crc for this record */
	rec_crcgen(s, dep, rbp);
	tmppnt = (caddr_t)rbp;
	/*
	 * the following code writes new records to all instances of
	 * the data base. Writing one block at a time to each instance
	 * is safe because they are not yet in a directory entry which
	 * has been written to the data base
	 */
	err = 0;
	if ((options & MD_CRO_OPTIMIZE) == 0) {
		for (i = 0; i < blkcnt;	 i++) {
			err |= writeall(s, (caddr_t)tmppnt,
			    dep->de_blks[i], 1, 0);
			tmppnt += MDDB_BSIZE;
		}
	} else {
		if ((MD_MNSET_SETNO(s->s_setno)) &&
		    md_set[s->s_setno].s_am_i_master) {
		/*
		 * If a MN diskset then only master writes out newly
		 * created optimized record.
		 */
			err |= writeoptrecord(s, dep);
		}
	}
	uniqtime32(&dbp->db_timestamp);
	dbp->db_revision = MDDB_REV_DB;
	/* Don't include opt resync and change log records in global XOR */
	if (!(dep->de_flags & MDDB_F_OPT) &&
	    !(dep->de_flags & MDDB_F_CHANGELOG))
		dbp->db_recsum ^= rbp->rb_checksum;
	/*
	 * Build the on-disk (32 bit) image of the directory block that now
	 * holds the entry, checksum it and write it out.
	 */
	db32p = (mddb_db32_t *)kmem_zalloc(MDDB_BSIZE, KM_SLEEP);
	create_db32rec(db32p, dbp);
	crcgen(db32p, &db32p->db32_checksum, MDDB_BSIZE, NULL);
	err |= writeall(s, (caddr_t)db32p, dbp->db_blknum, 1, 0);
	if (prevdbp) {
		/*
		 * a directory block was appended; rewrite its predecessor
		 * too, reusing the same scratch buffer
		 */
		dbp = prevdbp;
		uniqtime32(&dbp->db_timestamp);
		dbp->db_revision = MDDB_REV_DB;
		create_db32rec(db32p, dbp);
		crcgen(db32p, &db32p->db32_checksum, MDDB_BSIZE, NULL);
		err |= writeall(s, (caddr_t)db32p, dbp->db_blknum, 1, 0);
	}
	kmem_free((caddr_t)db32p, MDDB_BSIZE);
	if (err) {
		/*
		 * some write failed; if the retry fails as well, record
		 * the new id in s_zombie and report failure
		 */
		if (writeretry(s)) {
			s->s_zombie = newid;
			single_thread_end(s);
			mddb_setexit(s);
			return (MDDB_E_NOTNOW);
		}
	}
	single_thread_end(s);
	mddb_setexit(s);

	ASSERT((newid & MDDB_SETMASK) == 0);
	return (MAKERECID(setno, newid));
}

/*
 * Delete the record identified by `id' from its set's database.
 *
 * The in-core record, its user data, its blocks and its directory
 * entry are released, and the owning directory block is rewritten.
 *
 * Returns 0 on success or when no such record exists (in which case
 * s_staledeletes is decremented), and MDDB_E_NOTNOW when checkstate()
 * fails or the directory write cannot be retried successfully.
 */
int
mddb_deleterec(
	mddb_recid_t	id
)
{
	mddb_set_t	*s;
	mddb_db_t	*dbp;
	mddb_db32_t	*db32p;
	mddb_de_ic_t	*dep, *dep1;
	int		i;

#if defined(_ILP32) && !defined(lint)
	ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t));
	ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t));
#endif

	s = mddb_setenter(DBSET(id), MDDB_NOINIT, NULL);
	ASSERT(s != NULL);

	id = DBID(id);
	if (checkstate(s, MDDB_PROBE)) {
		mddb_setexit(s);
		return (MDDB_E_NOTNOW);
	}

	ASSERT(s->s_lbp != NULL);
	single_thread_start(s);

	/*
	 * Locate the directory entry; dep1 trails dep so that it names the
	 * predecessor of the match within the same directory block (NULL
	 * when the match is the first entry).
	 */
	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
		dep1 = NULL;
		for (dep = dbp->db_firstentry; dep; dep = dep->de_next) {
			if (dep->de_recid == id)
				break;
			dep1 = dep;
		}
		if (dep != NULL)
			break;
	}
	/*
	 * no such record; this is only expected as the retry of a delete
	 * that previously failed (see s_staledeletes below)
	 */
	if (dep == NULL) {
		single_thread_end(s);
		ASSERT(s->s_staledeletes != 0);
		s->s_staledeletes--;
		mddb_setexit(s);
		return (0);
	}

	/*
	 * Remove the record's contribution from the directory block's XOR
	 * sum; optimized and change log records are not part of it.
	 */
	if (!(dep->de_flags & MDDB_F_OPT) &&
	    !(dep->de_flags & MDDB_F_CHANGELOG)) {
		dbp->db_recsum ^= dep->de_rb->rb_checksum;
		dbp->db_recsum ^= dep->de_rb->rb_checksum_fiddle;
	}

	/*
	 * Free the user data.  A non-zero de_icreqsize means the buffer
	 * was replaced by a larger in-core one whose base is
	 * de_rb_userdata_ic.
	 */
	if (dep->de_rb_userdata != NULL) {
		if (dep->de_icreqsize)
			kmem_free(dep->de_rb_userdata_ic, dep->de_icreqsize);
		else
			kmem_free(dep->de_rb_userdata, dep->de_reqsize);
	}

	kmem_free((caddr_t)dep->de_rb, dep->de_recsize);

	/* release the record's blocks and unlink the directory entry */
	for (i = 0; i < dep->de_blkcount; i++)
		blkfree(s, dep->de_blks[i]);
	if (dep1)
		dep1->de_next = dep->de_next;
	else
		dbp->db_firstentry = dep->de_next;

	kmem_free(dep, sizeofde(dep));

	/* rewrite the directory block without the deleted entry */
	uniqtime32(&dbp->db_timestamp);
	dbp->db_revision = MDDB_REV_DB;
	db32p = (mddb_db32_t *)kmem_zalloc(MDDB_BSIZE, KM_SLEEP);
	create_db32rec(db32p, dbp);
	crcgen(db32p, &db32p->db32_checksum, MDDB_BSIZE, NULL);
	if (writeall(s, (caddr_t)db32p, dbp->db_blknum, 1, 0)) {
		if (writeretry(s)) {
			/*
			 * staledelete is used to mark deletes which failed.
			 * its only use is to not panic when the user retries
			 * the delete once the database is active again
			 */
			single_thread_end(s);
			s->s_staledeletes++;
			kmem_free((caddr_t)db32p, MDDB_BSIZE);
			mddb_setexit(s);
			return (MDDB_E_NOTNOW);
		}
	}
	single_thread_end(s);
	kmem_free((caddr_t)db32p, MDDB_BSIZE);
	mddb_setexit(s);
	return (0);
}

/*
 * Return the id of the first record that follows `id' in directory
 * order and matches the requested types.
 *
 * A record id part of 0 starts the scan at the very first entry.
 * `typ' of MDDB_ALL matches any type1 and `type2' of 0 matches any
 * type2.  Returns the full record id of the match, 0 when the scan
 * ran off the end after passing `id', MDDB_E_NORECORD when `id' itself
 * was never seen, or the error stored by mddb_setenter().
 */
mddb_recid_t
mddb_getnextrec(
	mddb_recid_t		id,
	mddb_type_t		typ,
	uint_t			type2
)
{
	mddb_set_t		*s;
	mddb_db_t		*dirp;
	mddb_de_ic_t		*entp;
	int			skipping, err;
	set_t			setno;

	setno = DBSET(id);
	id = DBID(id);
	/* non-zero while we are still looking for the starting record */
	skipping = id;

	s = mddb_setenter(setno, MDDB_MUSTEXIST, &err);
	if (s == NULL)
		return (err);

	for (dirp = s->s_dbp; dirp != NULL; dirp = dirp->db_next) {
		for (entp = dirp->db_firstentry; entp != NULL;
		    entp = entp->de_next) {
			if (skipping) {
				if (entp->de_recid == id)
					skipping = 0;
				continue;
			}
			if (typ != MDDB_ALL && entp->de_type1 != typ)
				continue;
			if (type2 != 0 && entp->de_type2 != type2)
				continue;

			id = entp->de_recid;
			mddb_setexit(s);
			ASSERT((id & MDDB_SETMASK) == 0);
			return (MAKERECID(setno, id));
		}
	}

	mddb_setexit(s);

	return (skipping ? MDDB_E_NORECORD : 0);
}

/*
 * Return the address of the data of record `id': the separate user
 * data buffer when the directory entry has one, otherwise the rb_data
 * area of the in-core record block.  Returns NULL when the set cannot
 * be entered or no entry carries that record id.
 */
void *
mddb_getrecaddr(
	mddb_recid_t		id
)
{
	mddb_set_t		*s;
	mddb_db_t		*dirp;
	mddb_de_ic_t		*entp = NULL;
	void			*addr = NULL;

	s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, NULL);
	if (s == NULL)
		return (NULL);

	id = DBID(id);
	for (dirp = s->s_dbp; dirp != NULL; dirp = dirp->db_next) {
		entp = dirp->db_firstentry;
		while (entp != NULL && entp->de_recid != id)
			entp = entp->de_next;
		if (entp != NULL)
			break;
	}

	if (entp != NULL) {
		if (entp->de_rb_userdata)
			addr = (void *)entp->de_rb_userdata;
		else
			addr = (void *)entp->de_rb->rb_data;
	}

	mddb_setexit(s);
	return (addr);
}


/*
 * Return the in-core directory entry for record `id', or NULL when the
 * set cannot be entered or the record id is not present.  The set is
 * released before returning.
 */
mddb_de_ic_t *
mddb_getrecdep(
	mddb_recid_t		id
)
{
	mddb_set_t		*s;
	mddb_db_t		*dirp;
	mddb_de_ic_t		*entp = NULL;

	s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, NULL);
	if (s == NULL)
		return (NULL);

	id = DBID(id);
	for (dirp = s->s_dbp; dirp != NULL; dirp = dirp->db_next) {
		entp = dirp->db_firstentry;
		while (entp != NULL && entp->de_recid != id)
			entp = entp->de_next;
		if (entp != NULL)
			break;
	}

	mddb_setexit(s);
	return (entp);
}

/*
 * Like mddb_getrecaddr(), but make sure the in-core user data buffer
 * of record `id' is at least `icsize' bytes.
 *
 * For a record with a separate user data buffer that is smaller than
 * icsize and has not been resized before, a zeroed buffer of icsize
 * bytes is allocated, the old data is copied to offset `off' within
 * it, and the base of the new buffer is returned.  A record without a
 * separate user data buffer cannot be resized; asking for more than
 * its record size panics.  Returns NULL when the set cannot be entered
 * or the record is not found.
 */
void *
mddb_getrecaddr_resize(
	mddb_recid_t		id,
	size_t			icsize,
	off_t			off
)
{
	mddb_set_t		*s;
	mddb_db_t		*dirp;
	mddb_de_ic_t		*entp = NULL;
	void			*addr = NULL;
	caddr_t			grown;

	s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, NULL);
	if (s == NULL)
		return (NULL);

	id = DBID(id);
	for (dirp = s->s_dbp; dirp != NULL; dirp = dirp->db_next) {
		entp = dirp->db_firstentry;
		while (entp != NULL && entp->de_recid != id)
			entp = entp->de_next;
		if (entp == NULL)
			continue;

		if (entp->de_rb_userdata)
			addr = (void *)entp->de_rb_userdata;
		else
			addr = (void *)entp->de_rb->rb_data;
		if (addr != NULL)
			break;
	}

	if (addr == NULL) {
		mddb_setexit(s);
		return (NULL);
	}

	if (!entp->de_rb_userdata) {
		size_t needed;
		/* LINTED variable unused - used for sizeof calculations */
		mddb_rb32_t *nrbp;

		/* no separate user data buffer: the record must already fit */
		needed = roundup((sizeof (*nrbp) - sizeof (nrbp->rb_data)) +
		    icsize, MDDB_BSIZE);
		if (entp->de_recsize < needed)
			cmn_err(CE_PANIC, "mddb_getrecaddr_resize: only "
			    "nonoptimized records can be resized\n");

		mddb_setexit(s);
		return (addr);
	}

	/* already resized once, or already big enough: nothing to do */
	if (entp->de_icreqsize || (entp->de_reqsize >= icsize)) {
		mddb_setexit(s);
		return (addr);
	}

	ASSERT((entp->de_reqsize + off) <= icsize);
	grown = kmem_zalloc(icsize, KM_SLEEP);
	bcopy(entp->de_rb_userdata, grown + off, entp->de_reqsize);
	kmem_free(entp->de_rb_userdata, entp->de_reqsize);
	entp->de_rb_userdata = grown + off;
	entp->de_rb_userdata_ic = grown;
	entp->de_icreqsize = icsize;

	mddb_setexit(s);
	return (grown);
}

/*
 * Return the rb_private field of record `id' as an int.  Returns the
 * error stored by mddb_setenter() when the set cannot be entered and
 * MDDB_E_NORECORD when the record id is not present.
 */
int
mddb_getrecprivate(
	mddb_recid_t		id
)
{
	mddb_set_t		*s;
	mddb_db_t		*dirp;
	mddb_de_ic_t		*entp = NULL;
	int			err = 0;
	int			result = MDDB_E_NORECORD;

	s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, &err);
	if (s == NULL)
		return (err);

	id = DBID(id);
	for (dirp = s->s_dbp; dirp != NULL; dirp = dirp->db_next) {
		entp = dirp->db_firstentry;
		while (entp != NULL && entp->de_recid != id)
			entp = entp->de_next;
		if (entp != NULL)
			break;
	}

	if (entp != NULL)
		result = (int)entp->de_rb->rb_private;

	mddb_setexit(s);
	return (result);
}

/*
 * Store `private' in the rb_private field of record `id'.  Failing to
 * enter the set or to find the record trips ASSERT(0); on non-DEBUG
 * builds the call is then a silent no-op.
 */
void
mddb_setrecprivate(
	mddb_recid_t		id,
	uint_t			private
)
{
	mddb_set_t		*s;
	mddb_db_t		*dirp;
	mddb_de_ic_t		*entp = NULL;

	s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, NULL);
	if (s == NULL) {
		ASSERT(0);
		return;
	}

	id = DBID(id);
	for (dirp = s->s_dbp; dirp != NULL; dirp = dirp->db_next) {
		entp = dirp->db_firstentry;
		while (entp != NULL && entp->de_recid != id)
			entp = entp->de_next;
		if (entp != NULL)
			break;
	}

	if (entp != NULL) {
		entp->de_rb->rb_private = private;
		mddb_setexit(s);
		return;
	}

	mddb_setexit(s);
	ASSERT(0);
}

/*
 * Return the type1 of record `id'.  Returns the error stored by
 * mddb_setenter() when the set cannot be entered and MDDB_E_NORECORD
 * when the record id is not present.
 */
mddb_type_t
mddb_getrectype1(
	mddb_recid_t		id
)
{
	mddb_set_t		*s;
	mddb_db_t		*dirp;
	mddb_de_ic_t		*entp = NULL;
	int			err = 0;
	mddb_type_t		type1;

	s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, &err);
	if (s == NULL)
		return (err);

	id = DBID(id);
	for (dirp = s->s_dbp; dirp != NULL; dirp = dirp->db_next) {
		entp = dirp->db_firstentry;
		while (entp != NULL && entp->de_recid != id)
			entp = entp->de_next;
		if (entp != NULL)
			break;
	}

	if (entp == NULL) {
		mddb_setexit(s);
		return (MDDB_E_NORECORD);
	}

	type1 = entp->de_type1;
	mddb_setexit(s);
	return (type1);
}

/*
 * Return the type2 of record `id' as an int.  Returns the error stored
 * by mddb_setenter() when the set cannot be entered and
 * MDDB_E_NORECORD when the record id is not present.
 */
int
mddb_getrectype2(
	mddb_recid_t		id
)
{
	mddb_set_t		*s;
	mddb_db_t		*dirp;
	mddb_de_ic_t		*entp = NULL;
	int			err = 0;
	int			result = MDDB_E_NORECORD;

	s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, &err);
	if (s == NULL)
		return (err);

	id = DBID(id);
	for (dirp = s->s_dbp; dirp != NULL; dirp = dirp->db_next) {
		entp = dirp->db_firstentry;
		while (entp != NULL && entp->de_recid != id)
			entp = entp->de_next;
		if (entp != NULL)
			break;
	}

	if (entp != NULL)
		result = (int)entp->de_type2;

	mddb_setexit(s);
	return (result);
}

/*
 * Return the requested (user) size of record `id' as an int.  Returns
 * the error stored by mddb_setenter() when the set cannot be entered
 * and MDDB_E_NORECORD when the record id is not present.
 */
int
mddb_getrecsize(
	mddb_recid_t		id
)
{
	mddb_set_t		*s;
	mddb_db_t		*dirp;
	mddb_de_ic_t		*entp = NULL;
	int			err = 0;
	int			result = MDDB_E_NORECORD;

	s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, &err);
	if (s == NULL)
		return (err);

	id = DBID(id);
	for (dirp = s->s_dbp; dirp != NULL; dirp = dirp->db_next) {
		entp = dirp->db_firstentry;
		while (entp != NULL && entp->de_recid != id)
			entp = entp->de_next;
		if (entp != NULL)
			break;
	}

	if (entp != NULL)
		result = (int)entp->de_reqsize;

	mddb_setexit(s);
	return (result);
}


/*
 * Report the status of record `id'.
 *
 * Returns MDDB_NORECORD when no directory entry carries the record id,
 * MDDB_NODATA when the record's commit count is still zero,
 * MDDB_STALE when the set has MD_SET_STALE set, and MDDB_OK otherwise.
 * When the set cannot be entered, the error stored by mddb_setenter()
 * is returned cast to mddb_recstatus_t.
 */
mddb_recstatus_t
mddb_getrecstatus(
	mddb_recid_t		id
)
{
	mddb_set_t		*s;
	mddb_db_t		*dbp;
	/*
	 * Must start out NULL: with an empty directory block list the
	 * search loop below never assigns it, and it is tested afterwards.
	 */
	mddb_de_ic_t		*dep = NULL;
	int			err = 0;
	mddb_recstatus_t	e_err;

	if ((s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, &err)) == NULL)
		return ((mddb_recstatus_t)err);

	id = DBID(id);
	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
		for (dep = dbp->db_firstentry;
		    dep != NULL; dep = dep->de_next) {
			if (dep->de_recid == id)
				break;
		}
		if (dep != NULL)
			break;
	}

	/* checks are ordered: missing record, never committed, stale set */
	e_err = MDDB_OK;

	if (dep == NULL)
		e_err = MDDB_NORECORD;
	else if (! dep->de_rb->rb_commitcnt)
		e_err = MDDB_NODATA;
	else if (md_get_setstatus(s->s_setno) & MD_SET_STALE)
		e_err = MDDB_STALE;

	mddb_setexit(s);
	return (e_err);
}

static int	mddb_commitrec_retries = 5;

/*
 * Commit given record to disk.
 * If committing an optimized record, do not call
 * with md ioctl lock held.
 *
 * Non-optimized records are handed to mddb_commitrecs().  Optimized
 * records are written with writeoptrecord(); several threads may do so
 * concurrently under one single_thread_start(), tracked by
 * s_optcmtcnt / s_opthavelck.  Write failures are handled per set
 * type: MN sets send a message to the master and retry up to
 * mddb_commitrec_retries times, other sets call writeretry() from the
 * last thread out.
 *
 * Returns 0 on success (and for a record id part of 0),
 * MDDB_E_NOTNOW when the database state does not allow the commit or
 * the write could not be recovered, MDDB_E_NORECORD when the record
 * does not exist (or disappears while the locks are dropped), and
 * MDDB_E_STALE when the MN retry loop gave up.
 */
int
mddb_commitrec(
	mddb_recid_t	id
)
{
	mddb_set_t			*s;
	mddb_db_t			*dbp;
	mddb_de_ic_t			*dep = NULL;
	mddb_recid_t			ids[2];
	mddb_rb32_t			*rbp;
	/*
	 * Automatic, not static: a static here would be shared by all
	 * callers and all sets while being read after the set mutex has
	 * been dropped.
	 */
	int				err = 0;
	md_mn_msg_mddb_optrecerr_t	*msg_recerr;
	md_mn_kresult_t			*kres;
	mddb_lb_t			*lbp;
	mddb_mnlb_t			*mnlbp;
	mddb_locator_t			*lp;
	mddb_mnsidelocator_t		*mnslp;
	mddb_drvnm_t			*dn;
	int				li;
	md_replica_recerr_t		*recerr;
	int				i, j;
	int				rval;
	int				hit_err = 0;
	int				retry = mddb_commitrec_retries;
	int				gave_up = 0;
	int				rec_gone = 0;

	s = mddb_setenter(DBSET(id), MDDB_NOINIT, NULL);
	ASSERT(s != NULL);

	if (checkstate(s, MDDB_PROBE)) {
		mddb_setexit(s);
		return (MDDB_E_NOTNOW);
	}

	if (DBID(id) == 0) {
		mddb_setexit(s);
		return (0);
	}

	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
		for (dep = dbp->db_firstentry; dep; dep = dep->de_next) {
			if (dep->de_recid == DBID(id))
				break;
		}
		if (dep)
			break;
	}

	if (dep == NULL) {
		mddb_setexit(s);
		return (MDDB_E_NORECORD);
	}

	/* non-optimized records take the regular multi-record path */
	if (! (dep->de_flags & MDDB_F_OPT)) {
		ids[0] = id;
		ids[1] = 0;
		mddb_setexit(s);
		return (mddb_commitrecs(ids));
	}

	/*
	 * following code allows multiple processes to be doing
	 * optimization commits in parallel.
	 * NOTE: if lots of optimization commits then the lock
	 * will not get released until it winds down
	 */
	if (s->s_optwaiterr) {
		/* an earlier write error is still being handled; wait */
		while (s->s_optwaiterr) {
			s->s_opthungerr = 1;
			cv_wait(&s->s_opthungerr_cv, SETMUTEX(s->s_setno));
		}
		if (checkstate(s, MDDB_PROBE)) {
			mddb_setexit(s);
			return (MDDB_E_NOTNOW);
		}
	}
	if (s->s_optcmtcnt++ == 0) {
		/* first committer takes single_thread on behalf of all */
		single_thread_start(s);
		s->s_opthavelck = 1;
		if (s->s_optwantlck) {
			cv_broadcast(&s->s_optwantlck_cv);
			s->s_optwantlck = 0;
		}
	} else {
		/* later committers wait until the first one holds it */
		while (! s->s_opthavelck) {
			s->s_optwantlck = 1;
			cv_wait(&s->s_optwantlck_cv, SETMUTEX(s->s_setno));
		}
	}

	/* look the record up again: we may have slept in cv_wait above */
	dep = NULL;
	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
		for (dep = dbp->db_firstentry; dep; dep = dep->de_next) {
			if (dep->de_recid == DBID(id))
				break;
		}
		if (dep)
			break;
	}

	if (dep == NULL) {
		if (! (--s->s_optcmtcnt)) {
			single_thread_end(s);
			s->s_opthavelck = 0;
		}
		mddb_setexit(s);
		return (MDDB_E_NORECORD);
	}

	rbp = dep->de_rb;
	rbp->rb_commitcnt++;
	uniqtime32(&rbp->rb_timestamp);
	/* Generate the crc for this record */
	rec_crcgen(s, dep, rbp);

	if (writeoptrecord(s, dep)) {
		if (MD_MNSET_SETNO(s->s_setno)) {
			hit_err = 1;
		}
		s->s_optwaiterr++;
	}
	if (MD_MNSET_SETNO(s->s_setno)) {
		/* If last thread out, release single_thread_start */
		if (! (--s->s_optcmtcnt)) {
			single_thread_end(s);
			s->s_opthavelck = 0;
		}
		/*
		 * If this thread had a writeoptrecords failure, then
		 * need to send message to master.
		 * But, multiple threads could all be running on the
		 * same single_thread_start, so serialize the threads
		 * by making each thread grab single_thread_start.
		 *
		 * After return from sending message to master message,
		 * replicas associated with optimized record will have
		 * been changed (via a callback from the master to all
		 * nodes), so retry call to writeoptrecord.
		 * This code is replacing the call to writeretry that
		 * occurs for the local and traditional disksets.
		 */
		if (hit_err) {
			single_thread_start(s);
			/*
			 * If > 50% of replicas are alive then continue
			 * to send message to master until writeoptrecord
			 * succeeds.  For now, assume that minor name,
			 * major number on this node is the same as on
			 * the master node.  Once devids are turned on
			 * for MN disksets, can send devid.
			 */
			kres = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
			msg_recerr = kmem_zalloc(
			    sizeof (md_mn_msg_mddb_optrecerr_t), KM_SLEEP);
			/*
			 * Point recerr at the first message slot up front so
			 * the "giving up" warning below never reads an
			 * unset pointer when no side locator matched.
			 */
			recerr = &msg_recerr->msg_recerr[0];
			while (!(md_get_setstatus(s->s_setno) &
			    MD_SET_TOOFEW)) {
				bzero((caddr_t)msg_recerr,
				    sizeof (md_mn_msg_mddb_optrecerr_t));
				lbp = s->s_lbp;
				mnlbp = (mddb_mnlb_t *)lbp;
				/*
				 * Describe both replicas holding the
				 * optimized record, using this side's
				 * locator to find the driver name.
				 */
				for (i = 0; i < 2; i++) {
					li = dep->de_optinfo[i].o_li;
					lp = &lbp->lb_locators[li];
					for (j = 0; j < MD_MNMAXSIDES; j++) {
						mnslp =
						    &mnlbp->
						    lb_mnsidelocators[j][li];
						if (mnslp->mnl_sideno ==
						    s->s_sideno)
							break;
					}
					if (j == MD_MNMAXSIDES)
						continue;

					dn = &lbp->
					    lb_drvnm[mnslp->mnl_drvnm_index];
					recerr = &msg_recerr->msg_recerr[i];
					recerr->r_li = li;
					recerr->r_flags =
					    dep->de_optinfo[i].o_flags;
					recerr->r_blkno = lp->l_blkno;
					recerr->r_mnum = md_getminor(lp->l_dev);
					(void) strncpy(recerr->r_driver_name,
					    dn->dn_data, MD_MAXDRVNM);
				}

				/* Release locks */
				single_thread_end(s);
				mutex_exit(SETMUTEX(s->s_setno));

				/*
				 * Send message to master about optimized
				 * record failure.  After return, master
				 * should have marked failed replicas
				 * and sent parse message to slaves causing
				 * slaves to have fixed up the optimized
				 * record.
				 * On return from ksend_message, retry
				 * the write since this node should have fixed
				 * the optimized resync records it owns.
				 */
				rval = mdmn_ksend_message(s->s_setno,
				    MD_MN_MSG_MDDB_OPTRECERR,
				    MD_MSGF_NO_BCAST, 0,
				    (char *)msg_recerr,
				    sizeof (md_mn_msg_mddb_optrecerr_t),
				    kres);
				if (!MDMN_KSEND_MSG_OK(rval, kres)) {
					cmn_err(CE_WARN, "mddb_commitrec: "
					    "Unable to send optimized "
					    "resync record failure "
					    "message to other nodes in "
					    "diskset %s\n", s->s_setname);
					mdmn_ksend_show_error(rval, kres,
					    "MD_MN_MSG_MDDB_OPTRECERR");
				}

				/* Regrab locks */
				mutex_enter(SETMUTEX(s->s_setno));
				single_thread_start(s);

				/* Start over in case mddb changed */
				dep = NULL;
				for (dbp = s->s_dbp; dbp != NULL;
				    dbp = dbp->db_next) {
					for (dep = dbp->db_firstentry; dep;
					    dep = dep->de_next) {
						if (dep->de_recid == DBID(id))
							break;
					}
					if (dep)
						break;
				}
				if (dep == NULL) {
					/*
					 * The record disappeared while the
					 * locks were dropped.  There is
					 * nothing left to write, and the
					 * next pass would dereference dep,
					 * so stop here.
					 */
					rec_gone = 1;
					break;
				}

				rbp = dep->de_rb;
				rbp->rb_commitcnt++;
				uniqtime32(&rbp->rb_timestamp);
				/* Generate the crc for this record */
				rec_crcgen(s, dep, rbp);

				/*
				 * If writeoptrecord succeeds, then
				 * break out.
				 */
				if (!(writeoptrecord(s, dep)))
					break;

				if (--retry == 0) {
					cmn_err(CE_WARN, "mddb_commitrec: "
					    "giving up writing optimized "
					    "resync record for "
					    "diskset %s, device %s,%d "
					    "blkno 0x%x, flags 0x%x\n",
					    s->s_setname, recerr->r_driver_name,
					    recerr->r_mnum, recerr->r_blkno,
					    recerr->r_flags);
					gave_up++;
					break;
				}
			}
			kmem_free(kres, sizeof (md_mn_kresult_t));
			kmem_free(msg_recerr,
			    sizeof (md_mn_msg_mddb_optrecerr_t));

			/* Resync record should be fixed - if possible */
			s->s_optwaiterr--;
			if (s->s_optwaiterr == 0) {
				/* All errors have been handled */
				if (s->s_opthungerr) {
					s->s_opthungerr = 0;
					cv_broadcast(&s->s_opthungerr_cv);
				}
			}
			single_thread_end(s);
			mddb_setexit(s);
			if (md_get_setstatus(s->s_setno) & MD_SET_TOOFEW) {
				return (MDDB_E_NOTNOW);
			} else if (rec_gone) {
				return (MDDB_E_NORECORD);
			} else if (gave_up) {
				return (MDDB_E_STALE);
			} else {
				return (0);
			}
		}
	} else {
		/* If set is a traditional or local set */
		if (! (--s->s_optcmtcnt)) {
			/*
			 * Last thread out: handle any accumulated write
			 * errors with a single writeretry() and wake the
			 * threads waiting for that to finish.
			 */
			err = 0;
			if (s->s_optwaiterr) {
				err = writeretry(s);
				s->s_optwaiterr = 0;
				if (s->s_opthungerr) {
					s->s_opthungerr = 0;
					cv_broadcast(&s->s_opthungerr_cv);
				}
			}
			single_thread_end(s);
			s->s_opthavelck = 0;
			mddb_setexit(s);
			if (err)
				return (MDDB_E_NOTNOW);
			return (0);
		}
		if (s->s_optwaiterr) {
			/* not last out: wait for the error handling above */
			while (s->s_optwaiterr) {
				s->s_opthungerr = 1;
				cv_wait(&s->s_opthungerr_cv,
				    SETMUTEX(s->s_setno));
			}
			if (checkstate(s, MDDB_NOPROBE)) {
				mddb_setexit(s);
				return (MDDB_E_NOTNOW);
			}
		}
	}

	mddb_setexit(s);
	return (0);
}

/*
 * Commit a zero-terminated list of non-optimized records, all of which
 * must belong to the same set.
 *
 * Every record gets its commit count bumped, a fresh timestamp and a
 * new crc.  The checksum "fiddle" of the last non-changelog record in
 * the list is set to the XOR of the old fiddles and of the old and new
 * checksums of all non-changelog records in the list.  The records are
 * then written to every active replica, unless this is a MN set and
 * this node is not the master.
 *
 * Returns 0 on success (and for an empty list), MDDB_E_NORECORD when
 * any id is unknown, MDDB_E_NOTNOW when the database state does not
 * allow the commit or the write could not be retried successfully, and
 * an error when the set cannot be entered.
 */
int
mddb_commitrecs(
	mddb_recid_t	ids[]
)
{
	mddb_set_t	*s;
	mddb_db_t	*dbp;
	mddb_de_ic_t	*dep;
	mddb_rb32_t	*rbp;
	mddb_rb32_t	*saverbp;
	mddb_lb_t	*lbp;
	int		li;
	uint_t		checksum;
	mddb_recid_t	*idp;
	int		err = 0;
	set_t		setno;

	if (panicstr)
		cmn_err(CE_PANIC, "md: mddb: commit not allowed");

	/*
	 * scan through and make sure ids are from the same set
	 */
	setno = DBSET(ids[0]);
	for (idp = ids; *idp != 0; idp++)
		ASSERT(DBSET(*idp) == setno);

	/*
	 * With MDDB_MUSTEXIST the set may not be available; bail out
	 * instead of dereferencing a NULL set below.  Fall back to
	 * MDDB_E_NOTNOW should mddb_setenter() not have stored an error.
	 */
	if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL)
		return (err != 0 ? err : MDDB_E_NOTNOW);

	if (checkstate(s, MDDB_PROBE)) {
		mddb_setexit(s);
		return (MDDB_E_NOTNOW);
	}

	ASSERT(s->s_lbp != NULL);
	err = 0;

	if (! ids[0]) {
		mddb_setexit(s);
		return (0);
	}

	single_thread_start(s);
	/*
	 * scan through and make sure ids all exist
	 */
	for (idp = ids; *idp != 0; idp++) {
		/* reset so an empty directory list reads as "not found" */
		dep = NULL;
		for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
			for (dep = dbp->db_firstentry; dep;
			    dep = dep->de_next) {
				if (dep->de_recid == DBID(*idp))
					break;
			}
			if (dep != NULL)
				break;
		}
		if (dep == NULL) {
			single_thread_end(s);
			mddb_setexit(s);
			return (MDDB_E_NORECORD);
		}
	}

	/*
	 * scan through records fix commit counts and
	 * zero fiddles and update time stamp and rechecksum record.
	 * Every id was verified above, so the lookups here always hit.
	 */
	checksum = 0;
	idp = ids;
	saverbp = NULL;
	while (*idp) {
		for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
			for (dep = dbp->db_firstentry; dep;
			    dep = dep->de_next) {
				if (dep->de_recid == DBID(*idp))
					break;
			}
			if (dep != NULL)
				break;
		}
		rbp = dep->de_rb;
		ASSERT(! (dep->de_flags & MDDB_F_OPT));

		getuserdata(setno, dep);
		/* Don't do fiddles for CHANGE LOG records */
		if (!(dep->de_flags & MDDB_F_CHANGELOG)) {
			/* fold out the old fiddle and the old checksum */
			checksum ^= rbp->rb_checksum_fiddle;
			rbp->rb_checksum_fiddle = 0;
			checksum ^= rbp->rb_checksum;
			saverbp = rbp;
		}
		rbp->rb_commitcnt++;
		uniqtime32(&rbp->rb_timestamp);
		/* Generate the crc for this record */
		rec_crcgen(s, dep, rbp);

		/* Don't do fiddles for CHANGE LOG records */
		if (!(dep->de_flags & MDDB_F_CHANGELOG)) {
			/* fold in the new checksum */
			checksum ^= rbp->rb_checksum;
		}
		idp++;
	}

	/* park the accumulated value in the last non-changelog record */
	if (saverbp)
		saverbp->rb_checksum_fiddle = checksum;

	/*
	 * If this is a MN set but we are not the master, then we are not
	 * supposed to update the mddb on disk. So we finish at this point.
	 */
	if ((setno != MD_LOCAL_SET) && (s->s_lbp->lb_flags & MDDB_MNSET) &&
	    (md_set[setno].s_am_i_master == 0)) {
		single_thread_end(s);
		mddb_setexit(s);
		return (0);
	}

	/*
	 * Write every record to each active replica in turn; the first
	 * write error stops the whole loop.
	 */
	lbp = s->s_lbp;
	for (li = 0; li < lbp->lb_loccnt; li++) {
		if (! (lbp->lb_locators[li].l_flags & MDDB_F_ACTIVE))
			continue;

		idp = ids;
		while (*idp) {
			for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
				dep = dbp->db_firstentry;
				while (dep && (dep->de_recid != DBID(*idp)))
					dep = dep->de_next;
				if (dep != NULL)
					break;
			}
			rbp = dep->de_rb;
			err = wrtblklst(s, (caddr_t)rbp, dep->de_blks,
			    dep->de_blkcount, li, (mddb_bf_t **)0,
			    MDDB_WR_ONLY_MASTER);
			if (err)
				break;
			idp++;
		}
		if (err)
			break;
	}
	if (err) {
		if (writeretry(s)) {
			single_thread_end(s);
			mddb_setexit(s);
			return (MDDB_E_NOTNOW);
		}
	}
	single_thread_end(s);
	mddb_setexit(s);
	return (0);
}

/*
 * Function wrapper around the MAKERECID() macro: combine a set number
 * and a record id into one record id value.
 */
mddb_recid_t
mddb_makerecid(
	set_t		setno,
	mddb_recid_t	id
)
{
	mddb_recid_t	recid = MAKERECID(setno, id);

	return (recid);
}

/*
 * Function wrapper around the DBSET() macro: extract the set number
 * from a record id.
 */
set_t
mddb_getsetnum(
	mddb_recid_t	id
)
{
	set_t	setno = DBSET(id);

	return (setno);
}

/*
 * mddb_getsetname:
 *	Return the s_setname field of the in-core mddb set hanging off
 *	md_set[setno].s_db.  The s_db pointer is dereferenced without a
 *	NULL check.
 */
char *
mddb_getsetname(set_t setno)
{
	mddb_set_t	*setp = (mddb_set_t *)md_set[setno].s_db;

	return (setp->s_setname);
}

/*
 * mddb_getsidenum:
 *	Return the s_sideno field of the in-core mddb set for setno, or 0
 *	when md_set[setno].s_db is not set.
 */
side_t
mddb_getsidenum(set_t setno)
{
	mddb_set_t	*setp = (mddb_set_t *)md_set[setno].s_db;

	if (setp == NULL)
		return (0);

	return (setp->s_sideno);
}

/*
 * mddb_ownset:
 *	Returns 1 when md_set[setno].s_db is set and either the set status
 *	has MD_SET_TAGDATA or the in-core set has a locator block (s_lbp).
 *	Returns 0 otherwise.
 */
int
mddb_ownset(set_t setno)
{
	int		tagdata;
	mddb_set_t	*setp;

	tagdata = ((md_get_setstatus(setno) & MD_SET_TAGDATA) != 0);
	setp = (mddb_set_t *)md_set[setno].s_db;

	/* Without an in-core set neither condition can hold */
	if (setp == NULL)
		return (0);

	if (tagdata || (setp->s_lbp != NULL))
		return (1);

	return (0);
}

/*
 * getmed_ioctl:
 *	Copy the s_med structure of the given set into the caller's
 *	parameter block.
 *
 *	Errors are recorded in medpp->med_mde.  Returns the value of the
 *	md error routine for an out-of-range set, a set that is not
 *	snarfed, or a failed mddb_setenter(); returns 0 when the local set
 *	cannot be snarfed (md_snarf_db_set() is handed ep) and on success.
 */
/*ARGSUSED*/
int
getmed_ioctl(mddb_med_parm_t *medpp, int mode)
{
	md_error_t	*ep = &medpp->med_mde;
	set_t		setno = medpp->med_setno;
	mddb_set_t	*setp;
	int		status = 0;

	mdclrerror(ep);

	if (setno >= md_nsets)
		return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));

	if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0)
		return (0);

	if (!(md_get_setstatus(setno) & MD_SET_SNARFED))
		return (mdmddberror(ep, MDE_DB_NOTOWNER, NODEV32, setno));

	setp = mddb_setenter(setno, MDDB_MUSTEXIST, &status);
	if (setp == NULL)
		return (mddbstatus2error(ep, status, NODEV32, setno));

	/* structure assignment */
	medpp->med = setp->s_med;

	mddb_setexit(setp);
	return (0);
}

/*
 * setmed_ioctl:
 *	Store the caller supplied mediator structure into the s_med field
 *	of the given set.  Requires FWRITE in mode; the local set and
 *	out-of-range set numbers are rejected with MDE_INVAL_UNIT.
 *
 *	Errors are recorded in medpp->med_mde.
 */
int
setmed_ioctl(mddb_med_parm_t *medpp, int mode)
{
	md_error_t	*ep = &medpp->med_mde;
	set_t		setno = medpp->med_setno;
	mddb_set_t	*setp;
	int		status = 0;

	mdclrerror(ep);

	if (!(mode & FWRITE))
		return (mdsyserror(ep, EACCES));

	/*
	 * The MD_LOCAL_SET test should be the only thing in the kernel
	 * that keeps LOCAL sets from having mediators; userland would need
	 * code written as well.  Out-of-range sets get the same error.
	 */
	if ((setno == MD_LOCAL_SET) || (setno >= md_nsets))
		return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));

	if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0)
		return (0);

	if (!(md_get_setstatus(setno) & MD_SET_SNARFED))
		return (mdmddberror(ep, MDE_DB_NOTOWNER, NODEV32, setno));

	setp = mddb_setenter(setno, MDDB_MUSTEXIST, &status);
	if (setp == NULL)
		return (mddbstatus2error(ep, status, NODEV32, setno));

	/* structure assignment */
	setp->s_med = medpp->med;

	mddb_setexit(setp);
	return (0);
}

/*
 * updmed_ioctl:
 *	Run upd_med() on the given set while single threaded.  Requires
 *	FWRITE in mode and a snarfed set.  The return value of upd_med()
 *	is ignored.
 *
 *	Errors are recorded in medpp->med_mde.
 */
int
updmed_ioctl(mddb_med_upd_parm_t *medpp, int mode)
{
	md_error_t	*ep = &medpp->med_mde;
	set_t		setno = medpp->med_setno;
	mddb_set_t	*setp;
	int		status = 0;

	mdclrerror(ep);

	if (!(mode & FWRITE))
		return (mdsyserror(ep, EACCES));

	if (setno >= md_nsets)
		return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));

	if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0)
		return (0);

	if (!(md_get_setstatus(setno) & MD_SET_SNARFED))
		return (mdmddberror(ep, MDE_DB_NOTOWNER, NODEV32, setno));

	setp = mddb_setenter(setno, MDDB_MUSTEXIST, &status);
	if (setp == NULL)
		return (mddbstatus2error(ep, status, NODEV32, setno));

	single_thread_start(setp);
	(void) upd_med(setp, "updmed_ioctl()");
	single_thread_end(setp);

	mddb_setexit(setp);
	return (0);
}

/*
 * take_set:
 *	Take the diskset described by cp.  Nothing is done (0 is returned)
 *	when the set status already has MD_SET_SNARFED.  Otherwise the set
 *	is configured with MDDB_GETDEV and snarfed, the replicated import
 *	flag is cleared, and, when cp->c_flags is zero, updmed_ioctl() is
 *	run for the set.
 *
 *	Returns the value from mddb_configure() or updmed_ioctl(); md
 *	errors are left in cp->c_mde.
 */
int
take_set(mddb_config_t *cp, int mode)
{
	set_t			setno = cp->c_setno;
	md_error_t		*ep = &cp->c_mde;
	mddb_med_upd_parm_t	medup;
	mddb_set_t		*setp;
	int			snarfed = 0;
	int			err;

	if (md_get_setstatus(setno) & MD_SET_SNARFED)
		return (0);

	err = mddb_configure(MDDB_GETDEV, cp);
	if ((err == 0) && mdisok(ep)) {
		if (md_snarf_db_set(setno, ep) != 0)
			goto out;
		snarfed = 1;
	}

	/*
	 * The replicated import flag is used during the take of a diskset
	 * with previously unresolved replicated disks, so clear it now.
	 */
	if (md_get_setstatus(setno) & MD_SET_REPLICATED_IMPORT)
		md_clr_setstatus(setno, MD_SET_REPLICATED_IMPORT);

	if ((err == 0) && mdisok(ep) && !cp->c_flags) {
		medup.med_setno = setno;
		mdclrerror(&medup.med_mde);

		err = updmed_ioctl(&medup, mode);
		if (!mdisok(&medup.med_mde))
			(void) mdstealerror(ep, &medup.med_mde);
	}

out:
	/*
	 * A failed snarf leaves the diskset with s_db set but s_lbp not
	 * set.  The node is then not an owner of the set and is not
	 * allowed to release it in order to clean up.  With s_db set, any
	 * call to the GETDEV or ENDDEV ioctl (done by the libmeta routine
	 * metareplicalist) will cause the diskset to be loaded.  Unload the
	 * set here so an inadvertent start does not happen later.
	 */
	if (snarfed == 0) {
		setp = (mddb_set_t *)md_set[setno].s_db;
		if ((setp != NULL) && (setp->s_lbp == 0)) {
			mutex_enter(&mddb_lock);
			mddb_unload_set(setno);
			mutex_exit(&mddb_lock);
		}
	}
	return (err);
}

/*
 * release_set:
 *	Halt the set named by cp and release it through
 *	mddb_configure(MDDB_RELEASESET).  The work is done with
 *	md_unit_array_rw held as writer, inside the halt/snarf and holdset
 *	brackets.  A successful release posts an ESC_SVM_RELEASE event.
 *
 *	Returns the md error value for an out-of-range set, EBUSY when the
 *	set cannot be marked HOLD, otherwise the value from md_halt_set()
 *	or mddb_configure().
 */
/*ARGSUSED*/
int
release_set(mddb_config_t *cp, int mode)
{
	md_error_t	*ep = &cp->c_mde;
	set_t		setno = cp->c_setno;
	int		held;
	int		err = 0;

	/* Data integrity check */
	if (setno >= md_nsets)
		return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));

	rw_enter(&md_unit_array_rw.lock, RW_WRITER);
	md_haltsnarf_enter(setno);

	/*
	 * Try to mark the set as HOLD.  If it is already marked, the mirror
	 * code is searching all mirrors for an errored component that needs
	 * a hotspare.  The set cannot be released while that search is in
	 * progress, so EBUSY is returned.  Once HOLD is set here, the
	 * mirror function (check_4_hotspares) will block before its search
	 * until the set is released.
	 */
	held = (md_holdset_testandenter(setno) == 0);
	if (held) {
		err = md_halt_set(setno, MD_HALT_ALL);
		if (err == 0)
			err = mddb_configure(MDDB_RELEASESET, cp);
		md_holdset_exit(setno);
	}

	md_haltsnarf_exit(setno);
	rw_exit(&md_unit_array_rw.lock);

	if (!held)
		return (EBUSY);

	if ((err == 0) && mdisok(ep)) {
		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RELEASE, SVM_TAG_SET, setno,
		    NODEV64);
	}

	return (err);
}

/*
 * gettag_ioctl:
 *	Look up a data tag on the set's s_dtlp list and copy it into
 *	dtgpp->dtgp_dt.  A requested dt_id of 0 matches the first tag on
 *	the list; any other value must match a tag's dt_id.
 *
 *	Requires FREAD in mode.  Errors are recorded in dtgpp->dtgp_mde;
 *	MDE_DB_NOTAG is set when no tag matches and MDE_INVAL_MNOP when the
 *	set status has MD_SET_MNSET.
 */
int
gettag_ioctl(mddb_dtag_get_parm_t *dtgpp, int mode)
{
	md_error_t	*ep = &dtgpp->dtgp_mde;
	set_t		setno = dtgpp->dtgp_setno;
	mddb_dtag_lst_t	*tagp;
	mddb_set_t	*setp;
	int		status = 0;

	mdclrerror(ep);

	if (!(mode & FREAD))
		return (mdsyserror(ep, EACCES));

	if (setno >= md_nsets)
		return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));

	if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0)
		return (0);

	setp = mddb_setenter(setno, MDDB_NOINIT, &status);
	if (setp == NULL)
		return (mddbstatus2error(ep, status, NODEV32, setno));

	/*
	 * Data tags are not supported on MN sets, so return invalid
	 * operation.  This ioctl could be called before the mddb has been
	 * read in, in which case the set status may not yet show MNSET; the
	 * code after this check must therefore cope with a MN diskset.
	 */
	if (md_get_setstatus(setno) & MD_SET_MNSET) {
		mddb_setexit(setp);
		return (mderror(ep, MDE_INVAL_MNOP));
	}

	/* s_dtlp is NULL for a MN diskset, so the loop finds nothing */
	for (tagp = setp->s_dtlp; tagp != NULL; tagp = tagp->dtl_nx) {
		if ((dtgpp->dtgp_dt.dt_id != 0) &&
		    (dtgpp->dtgp_dt.dt_id != tagp->dtl_dt.dt_id))
			continue;

		bcopy((caddr_t)&tagp->dtl_dt, (caddr_t)&dtgpp->dtgp_dt,
		    sizeof (mddb_dtag_t));
		break;
	}

	mddb_setexit(setp);

	/* Ran off the end of the list: the requested id is not there */
	if (tagp == (mddb_dtag_lst_t *)NULL)
		return (mdmddberror(ep, MDE_DB_NOTAG, NODEV32, setno));

	return (0);
}

/*
 * usetag_ioctl:
 *	Select the data tag dtupp->dtup_id for the set, then release the
 *	set and take it again so the selected tag is used.
 *
 *	The set's identity (timestamp, set number, side number, set name,
 *	mediator data) is saved in a temporary mddb_config_t and the hint
 *	information is saved with save_rip() before the set is released;
 *	both are used to re-initialize and re-take the set.
 *
 *	Requires FWRITE in mode, a tag id greater than zero, and
 *	MD_SET_TAGDATA in the set status.  Errors are recorded in
 *	dtupp->dtup_mde.  MD_SET_USETAG and MD_SET_KEEPTAG are cleared on
 *	every path that reaches the "out" label.
 */
int
usetag_ioctl(mddb_dtag_use_parm_t *dtupp, int mode)
{
	mddb_set_t	*s;
	int		err = 0;
	mddb_config_t	*cp;
	mddb_ri_t	*trip = NULL;
	mddb_dtag_t	*dtagp = NULL;
	set_t		setno = dtupp->dtup_setno;
	md_error_t	*ep = &dtupp->dtup_mde;

	mdclrerror(ep);

	if ((mode & FWRITE) == 0)
		return (mdsyserror(ep, EACCES));

	if (setno >= md_nsets)
		return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));

	if (dtupp->dtup_id < 0)
		return (mdsyserror(ep, EINVAL));
	else if (dtupp->dtup_id == 0)
		return (mdmddberror(ep, MDE_DB_NOTAG, NODEV32, setno));

	if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0)
		return (0);

	if ((md_get_setstatus(setno) & MD_SET_TAGDATA) == 0)
		return (mdmddberror(ep, MDE_DB_NTAGDATA, NODEV32, setno));

	if ((s = mddb_setenter(setno, MDDB_NOINIT, &err)) == NULL)
		return (mddbstatus2error(ep, err, NODEV32, setno));

	/*
	 * Data tags not supported on MN sets so return invalid operation.
	 * This ioctl could be called before the mddb has been read in so
	 * the set status may not yet be set to MNSET, so code following
	 * this check must handle a MN diskset properly.
	 */
	if (md_get_setstatus(setno) & MD_SET_MNSET) {
		mddb_setexit(s);
		return (mderror(ep, MDE_INVAL_MNOP));
	}

	/* Validate and find the id requested - nothing found if MN diskset */
	if ((dtagp = dtl_findl(s, dtupp->dtup_id)) == NULL) {
		mddb_setexit(s);
		return (mdmddberror(ep, MDE_DB_NOTAG, NODEV32, setno));
	}

	/* Usetag is only valid when more than one tag exists */
	if (dtl_cntl(s) < 2) {
		mddb_setexit(s);
		return (mdmddberror(ep, MDE_DB_NTAGDATA, NODEV32, setno));
	}

	/* Put the selected tag in place */
	dt_setup(s, dtagp);

	cp = kmem_zalloc(sizeof (mddb_config_t), KM_SLEEP);

	/* Save the hint information */
	trip = save_rip(s);

	cp->c_timestamp = s->s_ident.createtime;	/* struct assignment */
	cp->c_setno = setno;
	cp->c_sideno = s->s_sideno;
	(void) strncpy(cp->c_setname, s->s_setname, MD_MAX_SETNAME);
	cp->c_setname[MD_MAX_SETNAME] = '\0';
	cp->c_med = s->s_med;				/* struct assignment */

	mddb_setexit(s);

	s = NULL;

	/* shorthand */
	setno = cp->c_setno;

	/* Let unload know not to free the tag */
	md_set_setstatus(setno, MD_SET_KEEPTAG);

	/* Release the set */
	err = release_set(cp, mode);
	if (err != 0)
		goto out;

	if (! mdisok(&cp->c_mde)) {
		(void) mdstealerror(ep, &cp->c_mde);
		err = 1;
		goto out;
	}

	/* Re-init set using the saved mddb_config_t structure */
	if ((s = mddb_setenter(setno, MDDB_NOINIT, &err)) == NULL) {
		if ((s = init_set(cp, MDDB_NOINIT, &err)) == NULL) {
			err = mddbstatus2error(ep, err, NODEV32, setno);
			goto out;
		}
	}

	ASSERT(s->s_rip == (mddb_ri_t *)NULL);

	/*
	 * The ASSERT above is compiled out of non-DEBUG kernels.  Free any
	 * rip structure already attached to the set so that installing the
	 * saved one below cannot leak it (accept_ioctl does the same).
	 */
	if (s->s_rip != (mddb_ri_t *)NULL)
		free_rip(&s->s_rip);

	/* use the saved rip structure */
	s->s_rip = trip;
	trip = (mddb_ri_t *)NULL;

	/* Let the take code know a tag is being used */
	md_set_setstatus(setno, MD_SET_USETAG);

	mddb_setexit(s);

	s = NULL;

	/* Take the set */
	err = take_set(cp, mode);
	if (err != 0)
		goto out;

	if (! mdisok(&cp->c_mde))
		(void) mdstealerror(ep, &cp->c_mde);

out:
	md_clr_setstatus(setno, (MD_SET_USETAG | MD_SET_KEEPTAG));

	kmem_free(cp, sizeof (mddb_config_t));

	/* Still owned here only if it was never handed to the set */
	if (trip)
		free_rip(&trip);

	if (s)
		mddb_setexit(s);

	return (err);
}

/*
 * accept_ioctl:
 *	Accept the data in the set: tag it (set_dtag), write the tag out
 *	(dt_write), then release the set and take it again with
 *	MD_SET_ACCEPT set in the set status.
 *
 *	The set's identity and its hint information (save_rip) are saved
 *	before the release and used to re-initialize the set afterwards.
 *
 *	Requires FWRITE in mode and MD_SET_ACCOK in the set status.  Errors
 *	are recorded in accpp->accp_mde.  MD_SET_ACCOK and MD_SET_ACCEPT
 *	are cleared on every path that reaches the "out" label.
 */
int
accept_ioctl(mddb_accept_parm_t *accpp, int mode)
{
	md_error_t	*ep = &accpp->accp_mde;
	set_t		setno = accpp->accp_setno;
	mddb_ri_t	*trip = NULL;
	mddb_config_t	*cp;
	mddb_set_t	*s;
	int		err = 0;

	mdclrerror(ep);

	if (!(mode & FWRITE))
		return (mdsyserror(ep, EACCES));

	if (setno >= md_nsets)
		return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));

	if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0)
		return (0);

	if (!(md_get_setstatus(setno) & MD_SET_ACCOK))
		return (mdmddberror(ep, MDE_DB_ACCNOTOK, NODEV32, setno));

	s = mddb_setenter(setno, MDDB_MUSTEXIST, &err);
	if (s == NULL)
		return (mddbstatus2error(ep, err, NODEV32, setno));

	/*
	 * Data tags are not supported on MN sets, so return invalid
	 * operation.  The mddb is guaranteed to be incore at this point,
	 * so this check catches all MN disksets.
	 */
	if (md_get_setstatus(setno) & MD_SET_MNSET) {
		mddb_setexit(s);
		return (mderror(ep, MDE_INVAL_MNOP));
	}

	cp = kmem_zalloc(sizeof (mddb_config_t), KM_SLEEP);

	/* Keep the hint information across the release */
	trip = save_rip(s);

	/* Remember enough about the set to re-create it later */
	cp->c_timestamp = s->s_ident.createtime;	/* struct assignment */
	cp->c_setno = setno;
	cp->c_sideno = s->s_sideno;
	(void) strncpy(cp->c_setname, s->s_setname, MD_MAX_SETNAME);
	cp->c_setname[MD_MAX_SETNAME] = '\0';
	cp->c_med = s->s_med;				/* struct assignment */

	/* Tag the data */
	err = set_dtag(s, ep);
	if (err != 0) {
		err = mdsyserror(ep, err);
		goto out;
	}

	/* A BADTAG is about to be re-written, so drop the bit */
	if (md_get_setstatus(setno) & MD_SET_BADTAG)
		md_clr_setstatus(setno, MD_SET_BADTAG);

	err = dt_write(s);
	if (err != 0) {
		err = mdsyserror(ep, err);
		goto out;
	}

	mddb_setexit(s);
	s = NULL;

	/* shorthand */
	setno = cp->c_setno;

	/* Clear the keeptag */
	md_clr_setstatus(setno, MD_SET_KEEPTAG);

	/* Release the set */
	err = release_set(cp, mode);
	if (err != 0)
		goto out;

	if (!mdisok(&cp->c_mde)) {
		(void) mdstealerror(ep, &cp->c_mde);
		goto out;
	}

	/* Re-init the set from the saved mddb_config_t structure */
	s = mddb_setenter(setno, MDDB_NOINIT, &err);
	if (s == NULL) {
		s = init_set(cp, MDDB_NOINIT, &err);
		if (s == NULL) {
			err = mddbstatus2error(ep, err, NODEV32, setno);
			goto out;
		}
	}

	ASSERT(s->s_rip == (mddb_ri_t *)NULL);

	/* Free any rip structure already attached to the set */
	if (s->s_rip != (mddb_ri_t *)NULL)
		free_rip(&s->s_rip);

	/* Hand the saved rip structure over to the set */
	s->s_rip = trip;
	trip = (mddb_ri_t *)NULL;

	/* Let the set init code know an accept is in progress */
	md_set_setstatus(setno, MD_SET_ACCEPT);

	mddb_setexit(s);
	s = NULL;

	/* Take the set */
	err = take_set(cp, mode);
	if (err != 0)
		goto out;

	if (!mdisok(&cp->c_mde))
		(void) mdstealerror(ep, &cp->c_mde);

out:
	md_clr_setstatus(setno, (MD_SET_ACCOK | MD_SET_ACCEPT));

	kmem_free(cp, sizeof (mddb_config_t));

	if (trip != NULL)
		free_rip(&trip);

	if (s != NULL)
		mddb_setexit(s);

	return (err);
}

/*
 * mddb_getinvlb_devid - cycles through the locator block and determines
 *		if the device id's for any of the replica disks are invalid.
 *		If so, it returns the diskname in the ctdptr.
 *
 *	ARGUMENTS
 *		setno	set whose locator block is examined
 *		count	maximum number of invalid device id's expected
 *		size	spacing, in bytes, between names in the output list
 *		ctdptr	in: where to start writing names
 *			out: advanced past the last name written, so the
 *			caller can keep appending
 *
 *	Each name is the locator's suffix data with the trailing slice part
 *	(everything from the last 's') removed.  A name already present in
 *	the list is not added a second time.  The list is terminated with
 *	a null byte at the final position.
 *
 *	RETURN
 *		-1	Error (set not available, locator block belongs to
 *			another set, more than count invalid device id's,
 *			or a device name longer than size)
 *		cnt	number of invalid device id's
 */
int
mddb_getinvlb_devid(
	set_t	setno,
	int	count,
	int	size,
	char	**ctdptr
)
{
	mddb_set_t	*s;
	int		err = 0;
	mddb_lb_t	*lbp;
	int		li;
	mddb_did_blk_t	*did_blk;
	mddb_did_info_t	*did_info;
	int		len;
	int		cnt = 0;
	char		*cptr;
	md_name_suffix	*sn;
	int		i, dont_add_it;
	char		*tmpctd, *diskname;
	char		*tmpname;

	cptr = *ctdptr;
	if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL) {
		return (-1);
	}

	single_thread_start(s);
	lbp = s->s_lbp;

	if (lbp->lb_setno != setno) {
		single_thread_end(s);
		mddb_setexit(s);
		return (-1);
	}

	/* check for lb being devid style */
	if (lbp->lb_flags & MDDB_DEVID_STYLE) {
		did_blk = s->s_did_icp->did_ic_blkp;
		for (li = 0; li < lbp->lb_loccnt; li++) {
			did_info = &(did_blk->blk_info[li]);
			/* Only if devid exists and isn't valid */
			if ((did_info->info_flags & MDDB_DID_EXISTS) &&
			    !(did_info->info_flags & MDDB_DID_VALID)) {
				/*
				 * if we count more invalid did's than
				 * was passed in there's an error somewhere.
				 * Pre-increment so that the (count + 1)th
				 * entry is rejected rather than written.
				 */
				if (++cnt > count) {
					single_thread_end(s);
					mddb_setexit(s);
					return (-1);
				}

				/*
				 * Future note: Need to do something here
				 * for the MN diskset case when device ids
				 * are supported in disksets.
				 * Can't add until merging devids_in_diskset
				 * code into code base.
				 */

				sn = &s->s_lnp->ln_suffixes[0][li];
				/*
				 * check to make sure length of device name is
				 * not greater than computed first time through
				 */
				len = sn->suf_len;
				if (len > size) {
					single_thread_end(s);
					mddb_setexit(s);
					return (-1);
				}
				tmpctd = *ctdptr;
				/*
				 * strip off slice part; a name with no 's'
				 * in it is used unchanged
				 */
				diskname = md_strdup(sn->suf_data);
				tmpname = strrchr(diskname, 's');
				if (tmpname != NULL)
					*tmpname = '\0';
				dont_add_it = 0;
				/* look to see if diskname is already in list */
				for (i = 0; i < (cnt-1); i++) {
					if (strcmp(diskname, tmpctd) == 0) {
						/* already there, don't add */
						dont_add_it = 1;
						break;
					}
					/* point to next diskname in list */
					tmpctd += size;
				}
				if (dont_add_it == 0) {
					/* add diskname to list */
					(void) strcpy(cptr, diskname);
					cptr += size;
				}
				/*
				 * diskname was truncated above, so size the
				 * free from the untouched source string.
				 */
				kmem_free(diskname, strlen(sn->suf_data) + 1);
			}
		}
	}
	/* null terminate the list */
	*cptr = '\0';
	/*
	 * need to save the new pointer so that calling routine can continue
	 * to add information onto the end.
	 */
	*ctdptr = cptr;
	single_thread_end(s);
	mddb_setexit(s);
	return (cnt);
}

/*
 * mddb_validate_lb - count the locators in a devid style locator block
 *		whose device id exists but is not marked valid and does not
 *		match the device id obtained from the device.  Entries that
 *		do match are marked valid and updated.  The longest device
 *		name length seen among the mismatches is kept in *rmaxsz.
 *
 *		The locator block is pushed and upd_med() is run before
 *		returning, including when the block is not devid style.
 *	RETURN
 *		-1	error (set not available, locator block belongs to
 *			another set, or push_lb() failed)
 *		 cnt	number of lb's with invalid devid's
 */
int
mddb_validate_lb(
	set_t	setno,
	int	*rmaxsz
)
{
	mddb_set_t	*s;
	mddb_lb_t	*lbp;
	int		err = 0;
	int		li;
	int		ninvalid = 0;

	s = mddb_setenter(setno, MDDB_MUSTEXIST, &err);
	if (s == NULL)
		return (-1);

	single_thread_start(s);
	lbp = s->s_lbp;

	if (lbp->lb_setno != setno) {
		single_thread_end(s);
		mddb_setexit(s);
		return (-1);
	}

	/* Only a devid style lb has anything to validate */
	if (lbp->lb_flags & MDDB_DEVID_STYLE) {
		mddb_did_blk_t	*did_blk = s->s_did_icp->did_ic_blkp;

		for (li = 0; li < lbp->lb_loccnt; li++) {
			mddb_did_info_t	*did_info = &did_blk->blk_info[li];
			ddi_devid_t	rtn_devid = NULL;
			ddi_devid_t	devid;
			char		*minor_name;
			dev_t		ddi_dev;
			int		get_rval;
			int		len;

			/* Skip entries with no devid or a valid one */
			if (!(did_info->info_flags & MDDB_DID_EXISTS) ||
			    (did_info->info_flags & MDDB_DID_VALID))
				continue;

			/* The devid exists but has not been validated */
			ddi_dev = expldev(lbp->lb_locators[li].l_dev);
			get_rval = mddb_devid_get(s, li, &devid, &minor_name);
			ASSERT(get_rval == 1);

			if ((ddi_lyr_get_devid(ddi_dev, &rtn_devid) ==
			    DDI_SUCCESS) &&
			    (ddi_devid_compare(rtn_devid, devid) == 0)) {
				did_info->info_flags = MDDB_DID_VALID |
				    MDDB_DID_EXISTS | MDDB_DID_UPDATED;
			} else {
				ninvalid++;
				/*
				 * Future note: Need to do something here
				 * for the MN diskset case when device ids
				 * are supported in disksets.
				 * Can't add until merging devids_in_diskset
				 * code into code base.
				 */
				len = s->s_lnp->ln_suffixes[0][li].suf_len;
				if (*rmaxsz < len)
					*rmaxsz = len;
			}

			if (rtn_devid != NULL)
				ddi_devid_free(rtn_devid);
		}
	}

	if (push_lb(s) != 0)
		ninvalid = -1;
	(void) upd_med(s, "mddb_validate_lb(0)");
	single_thread_end(s);
	mddb_setexit(s);
	return (ninvalid);
}

/*
 * check_active_locators:
 *	Under mddb_lock, report whether the local set's locator block has
 *	at least one locator flagged MDDB_F_ACTIVE.
 *
 *	Returns 1 if an active locator is found.  Returns 0 if none is
 *	found, or if the local set has no in-core mddb set or no locator
 *	block.
 */
int
check_active_locators()
{
	mddb_set_t	*setp;
	mddb_lb_t	*lbp;
	int		idx;
	int		active = 0;

	mutex_enter(&mddb_lock);

	/* With no set or no locator block there is nothing here */
	setp = (mddb_set_t *)md_set[MD_LOCAL_SET].s_db;
	lbp = (setp != NULL) ? setp->s_lbp : NULL;

	if (lbp != NULL) {
		for (idx = 0; (idx < lbp->lb_loccnt) && !active; idx++) {
			if (lbp->lb_locators[idx].l_flags & MDDB_F_ACTIVE)
				active = 1;
		}
	}

	mutex_exit(&mddb_lock);
	return (active);
}

/*
 * regetoptrecord:
 * --------------
 *	Update the in-core optimized resync record contents by re-reading the
 *	record from the on-disk metadb.
 *	The contents of the resync record will be overwritten by calling this
 *	routine. This means that callers that require the previous contents to
 *	be preserved must save the data before calling this routine.
 *
 *	The record has up to two on-disk copies, described by
 *	dep->de_optinfo[0] and dep->de_optinfo[1].  The first copy that reads
 *	and verifies cleanly lands in dep->de_rb; a second good copy is read
 *	into a scratch buffer and its checksum is compared with the first.
 *	If the checksums differ, the second location is flagged
 *	MDDB_F_EDATA.
 *
 *	Return values:
 *	0 - successfully read in resync record from a mddb
 *	1 - failure.  Unable to read resync record from either mddb.
 */
static int
regetoptrecord(
	mddb_set_t	*s,
	mddb_de_ic_t	*dep
)
{
	mddb_lb_t	*lbp;
	mddb_locator_t	*lp;
	mddb_rb32_t	*rbp, *crbp;
	int		li;
	int		i;
	int		err = 0;
	size_t		recsize;

#if defined(_ILP32) && !defined(lint)
	ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t));
#endif

	recsize = dep->de_recsize;
	/* scratch buffer for the second copy of the record */
	crbp = (mddb_rb32_t *)kmem_zalloc(recsize, KM_SLEEP);

	single_thread_start(s);
	/* reads target the in-core record until one copy verifies */
	rbp = dep->de_rb;

	/* assume both copies are bad until they are read successfully */
	dep->de_optinfo[0].o_flags |= MDDB_F_EDATA;
	dep->de_optinfo[1].o_flags |= MDDB_F_EDATA;

	lbp = s->s_lbp;

	for (i = 0; i < 2; i++) {
		if (! (dep->de_optinfo[i].o_flags & MDDB_F_ACTIVE))
			continue;
		li = dep->de_optinfo[i].o_li;
		lp = &lbp->lb_locators[li];

		if (! (lp->l_flags & MDDB_F_ACTIVE) ||
		    (lp->l_flags & MDDB_F_EMASTER))
			continue;

		/*
		 * re-read the optimized resync record with failfast set
		 * since a failed disk could lead to a very long wait.
		 */
		err = readblklst(s, (caddr_t)rbp, dep->de_blks,
		    dep->de_blkcount, li, B_FAILFAST);

		if (err)
			continue;

		if (rbp->rb_magic != MDDB_MAGIC_RB)
			continue;

		if (revchk(MDDB_REV_RB, rbp->rb_revision))
			continue;

		/* Check the crc for this record */
		if (rec_crcchk(s, dep, rbp)) {
			continue;
		}
		/* this copy is good; drops the MDDB_F_EDATA set above */
		dep->de_optinfo[i].o_flags = MDDB_F_ACTIVE;

		if (rbp == crbp) {
			/*
			 * Second good copy.  Compare it with the first
			 * copy, which is in dep->de_rb.  (rbp and crbp are
			 * the same buffer here, so comparing those two
			 * could never detect a mismatch.)
			 */
			if (dep->de_rb->rb_checksum != crbp->rb_checksum)
				dep->de_optinfo[1].o_flags |= MDDB_F_EDATA;
			break;
		}
		/* first good copy is in place; read the next into scratch */
		rbp = crbp;
	}

	single_thread_end(s);

	/* rbp moved to the scratch buffer only if a copy was read */
	if (rbp == crbp) {
		/*
		 * NOTE(review): this clears rb_private in the scratch
		 * buffer that is freed on the next line, not in
		 * dep->de_rb - confirm which record was intended.
		 */
		rbp->rb_private = 0;
		kmem_free((caddr_t)crbp, recsize);
		return (0);
	}
	/* no copy could be read; restamp the in-core record */
	uniqtime32(&rbp->rb_timestamp);
	/* Generate the crc for this record */
	rec_crcgen(s, dep, rbp);
	kmem_free((caddr_t)crbp, recsize);
	return (1);
}

/*
 * mddb_reread_rr:
 *	Re-read the resync record from the on-disk copy. This is required for
 *	multi-node support so that a new mirror-owner can determine if a resync
 *	operation is required to guarantee data integrity.
 *
 * Arguments:
 *	setno	Associated set
 *	id	Resync record ID
 *
 * Return Value:
 *	0	successful reread
 *	-1	invalid set (not multi-node or non-existent), or no record
 *		with the given ID was found in the set
 *	>0	metadb state invalid, failed to reread
 */
int
mddb_reread_rr(
	set_t		setno,
	mddb_recid_t	id
)
{
	mddb_set_t	*s;
	int		err = 0;
	mddb_db_t	*dbp;
	/* must start out NULL: the search loop may not execute at all */
	mddb_de_ic_t	*dep = NULL;

	if (setno >= md_nsets)
		return (-1);

	if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL)
		return (-1);

	/* only multi-node disksets are handled here */
	if ((setno == MD_LOCAL_SET) || !(s->s_lbp->lb_flags & MDDB_MNSET)) {
		mddb_setexit(s);
		return (-1);
	}

	/* search every directory block for the entry with this record id */
	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
		dep = dbp->db_firstentry;
		while (dep && (dep->de_recid != DBID(id)))
			dep = dep->de_next;
		if (dep != NULL)
			break;
	}

	if (dep != NULL) {
		err = regetoptrecord(s, dep);
	} else {
		err = -1;
	}
	mddb_setexit(s);
	return (err);
}

/*
 * Set owner associated with MN optimized resync record.
 *
 * Optimized records have an owner node associated with them in
 * a MN diskset.  The owner is only set on a node that is actively
 * writing to that record.  The other nodes will show that record
 * as having an invalid owner.  The owner for an optimized record
 * is used during fixoptrecord to determine which node should
 * write out the record when the replicas associated with that
 * optimized record have been changed.
 *
 * Called directly from mirror driver and not from an ioctl.
 *
 * Arguments:
 *	id	record id; the set is taken from DBSET(id)
 *	owner	node id stored in the entry's de_owner_nodeid
 *
 * Returns
 *	0 if successful.
 *	MDDB_E_NORECORD if the set number is out of range, the set cannot
 *	be entered, or the record is not found.
 */
int
mddb_setowner(
	mddb_recid_t		id,
	md_mn_nodeid_t		owner
)
{
	mddb_set_t		*s;
	mddb_db_t		*dbp;
	mddb_de_ic_t		*dep;
	int			found = 0;

	if (DBSET(id) >= md_nsets)
		return (MDDB_E_NORECORD);

	if ((s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, NULL)) == NULL)
		return (MDDB_E_NORECORD);

	/* entries carry only the record part of the id */
	id = DBID(id);
	for (dbp = s->s_dbp; (dbp != NULL) && !found; dbp = dbp->db_next) {
		for (dep = dbp->db_firstentry;
		    dep != NULL; dep = dep->de_next) {
			if (dep->de_recid == id) {
				dep->de_owner_nodeid = owner;
				found = 1;
				break;
			}
		}
	}

	mddb_setexit(s);

	/* this function returns int, so report success as 0, not NULL */
	return (found ? 0 : MDDB_E_NORECORD);
}

/*
 * mddb_parse re-reads portions of the mddb from disk given a list
 * of good replicas to read from and flags describing
 * which portion of the mddb to read in.
 *
 * Used in a MN diskset when the master has made a change to some part
 * of the mddb and wants to relay this information to the slaves.
 */
int
mddb_parse(mddb_parse_parm_t *mpp)
{
	mddb_set_t	*s;
	int		err = 0;
	mddb_locator_t	*lp, *old_lp;
	mddb_lb_t	*lbp, *old_lbp;
	int		rval = 0;
	int		i, li;
	int		found_good_one = 0;
	mddb_ln_t	*lnp;
	mddb_block_t	ln_blkcnt;
	md_error_t	*ep = &mpp->c_mde;

	if (mpp->c_setno >= md_nsets)
		return (EINVAL);

	if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0)
		return (0);

	if ((s = mddb_setenter(mpp->c_setno, MDDB_MUSTEXIST, &err)) == NULL) {
		return (mddbstatus2error(ep, err, NODEV32, mpp->c_setno));
	}

	if (!(MD_MNSET_SETNO(mpp->c_setno))) {
		mddb_setexit_no_parse(s);
		return (EINVAL);
	}

	/*
	 * Master node initiated this request, so there's no work for
	 * the master node to do.
	 */
	if (md_set[mpp->c_setno].s_am_i_master) {
		mddb_setexit_no_parse(s);
		return (rval);
	}

	single_thread_start(s);

	if (mpp->c_parse_flags & MDDB_PARSE_LOCBLK) {
		lbp = 0;
		for (i = 0; i < MDDB_NLB; i++) {
			/* Walk through master's active list */
			if (!(mpp->c_lb_flags[i] & MDDB_F_ACTIVE))
				continue;
			if (s->s_mbiarray[i] == NULL)
				continue;

			/* Assumes master blocks are already setup */
			if (lbp == (mddb_lb_t *)NULL) {
				lbp = (mddb_lb_t *)kmem_zalloc(
				    dbtob(MDDB_MNLBCNT), KM_SLEEP);
			}
			err |= readblks(s, (caddr_t)lbp, 0, lbp->lb_blkcnt, i);

			if (err)
				continue;

			if (lbp->lb_magic != MDDB_MAGIC_LB)
				continue;
			if (lbp->lb_blkcnt != MDDB_MNLBCNT)
				continue;
			if (revchk(MDDB_REV_MNLB, lbp->lb_revision))
				continue;
			if (crcchk(lbp, &lbp->lb_checksum, dbtob(MDDB_MNLBCNT),
			    NULL))
				continue;
			if (lbp->lb_setno != s->s_setno)
				continue;
			/*
			 * a commit count of zero means this locator has
			 * been deleted
			 */
			if (lbp->lb_commitcnt == 0) {
				continue;
			}
			/* Found a good locator - keep it */
			found_good_one = 1;
			break;
		}

		/*
		 * If found a good copy of the mddb, then read it into
		 * this node's locator block.  Fix up the set's s_mbiarray
		 * pointer (master block incore array pointer) to be
		 * in sync with the newly read in locator block.  If a
		 * new mddb was added, read in the master blocks associated
		 * with the new mddb.  If an mddb was deleted, free the
		 * master blocks associated with deleted mddb.
		 */
		if (found_good_one)  {
			/* Compare old and new view of mddb locator blocks */
			old_lbp = s->s_lbp;
			for (li = 0; li < lbp->lb_loccnt; li++) {
				int	mn_set;

				lp = &lbp->lb_locators[li];
				old_lp = &old_lbp->lb_locators[li];

				/* If old and new views match, continue */
				if ((lp->l_flags & MDDB_F_ACTIVE) ==
				    (old_lp->l_flags & MDDB_F_ACTIVE))
					continue;

				if (lp->l_flags & MDDB_F_ACTIVE) {
					/*
					 * If new mddb has been added - delete
					 * old mbiarray and get new one.
					 *
					 * When devids are supported, will
					 * need to get dev from devid.
					 */
					if (s->s_mbiarray[li]) {
						free_mbipp(&s->s_mbiarray[li]);
					}
					/*
					 * If getmasters fails, getmasters
					 * will set appropriate error flags.
					 */
					s->s_mbiarray[li] = getmasters(s,
					    md_expldev(lp->l_dev), lp->l_blkno,
					    (uint_t *)&(lp->l_flags), &mn_set);
				} else if (lp->l_flags & MDDB_F_DELETED) {
					/*
					 * If old one has been deleted -
					 * delete old mbiarray.
					 */
					if (s->s_mbiarray[li]) {
						free_mbipp(&s->s_mbiarray[li]);
					}
				}
			}

			/* Free this node's old view of mddb locator blocks */
			kmem_free((caddr_t)s->s_lbp,
			    dbtob(s->s_lbp->lb_blkcnt));
			s->s_lbp = lbp;
		} else {
			if (lbp)
				kmem_free(lbp, dbtob(MDDB_MNLBCNT));
		}
	}

	if (mpp->c_parse_flags & MDDB_PARSE_LOCNM) {
		lnp = s->s_lnp;
		lbp = s->s_lbp;
		ln_blkcnt = lbp->lb_lnblkcnt;
		s->s_lnp = NULL; /* readlocnames does this anyway */
		for (li = 0; li < lbp->lb_loccnt; li++) {
			lp = &lbp->lb_locators[li];

			if ((! (lp->l_flags & MDDB_F_ACTIVE)) ||
			    (lp->l_flags & MDDB_F_EMASTER))
				continue;

			/* Successfully read the locator names */
			if (readlocnames(s, li) == 0)
				break;
		}

		if (li == lbp->lb_loccnt) {
			/* Did not successfully read locnames; restore lnp */
			s->s_lnp = lnp;
		} else {
			/* readlocnames successful, free old struct */
			kmem_free((caddr_t)lnp, dbtob(ln_blkcnt));
		}
	}

	if (mpp->c_parse_flags & MDDB_PARSE_OPTRECS) {
		mddb_de_ic_t	*dep, *tdep, *first_dep, *dep2;
		mddb_db_t	*dbp;
		mddb_db32_t	*db32p;
		mddb_de32_t	*de32p, *de32p2;
		int		writeout;

		lbp = s->s_lbp;
		/*
		 * Walk through directory block and directory entry incore
		 * linked list looking for optimized resync records.
		 * For each opt record found, re-read in directory block.
		 * The directoy block consists of a number of directory
		 * entries.  The directory entry for this opt record will
		 * describe which 2 mddbs actually contain the resync record
		 * since it could have been relocated by the master node
		 * due to mddb failure or mddb deletion.  If this node
		 * is the record owner for this opt record, then write out
		 * the record to the 2 mddbs listed in the directory entry
		 * if the mddbs locations are different than previously known.
		 */
		for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
			for (dep = dbp->db_firstentry; dep;
			    dep = dep->de_next) {
				/* Found an opt record */
				if (dep->de_flags & MDDB_F_OPT)
					break;
			}
			/* If no opt records found, go to next dbp */
			if (dep == NULL)
				continue;

			/*
			 * Reread directory block from disk since
			 * master could have rewritten in during fixoptrecord.
			 */
			db32p = (mddb_db32_t *)kmem_zalloc(MDDB_BSIZE,
			    KM_SLEEP);
			create_db32rec(db32p, dbp);
			for (li = 0; li < lbp->lb_loccnt; li++) {
				lp = &lbp->lb_locators[li];

				if ((! (lp->l_flags & MDDB_F_ACTIVE)) ||
				    (lp->l_flags & MDDB_F_EMASTER))
					continue;

				err = readblks(s, (caddr_t)db32p,
				    db32p->db32_blknum, 1, li);
				if (err)
					continue;

				/* Reverify db; go to next mddb if bad */
				if ((db32p->db32_magic != MDDB_MAGIC_DB) ||
				    (revchk(MDDB_REV_DB,
				    db32p->db32_revision)) ||
				    (crcchk(db32p, &db32p->db32_checksum,
				    MDDB_BSIZE, NULL))) {
					continue;
				} else {
					break;
				}
			}
			/*
			 * If all mddbs are unavailable then panic since
			 * this slave cannot be allowed to continue out-of-sync
			 * with the master node.  Since the optimized resync
			 * records are written by all nodes, all nodes must
			 * stay in sync with the master.
			 *
			 * This also handles the case when all storage
			 * connectivity to a slave node has failed.  The
			 * slave node will send an MDDB_OPTRECERR message to
			 * the master node when the slave node has been unable
			 * to write an optimized resync record to both
			 * designated mddbs.  After the master has fixed the
			 * optimized records to be on available mddbs, the
			 * MDDB_PARSE message (with the flag MDDB_PARSE_OPTRECS)
			 * is sent to all slave nodes.  If a slave node is
			 * unable to access any mddb in order to read in the
			 * relocated optimized resync record, then the slave
			 * node must panic.
			 */
			if (li == lbp->lb_loccnt) {
				kmem_free((caddr_t)db32p, MDDB_BSIZE);
				cmn_err(CE_PANIC, "md: mddb: Node unable to "
				    "access any SVM state database "
				    "replicas for diskset %s\n", s->s_setname);
			}
			/*
			 * Setup temp copy of linked list of de's.
			 * Already have an incore copy, but need to walk
			 * the directory entry list contained in the
			 * new directory block that was just read in above.
			 * After finding the directory entry of an opt record
			 * by walking the incore list, find the corresponding
			 * entry in the temporary list and then update
			 * the incore directory entry record with
			 * the (possibly changed) mddb location stored
			 * for the optimized resync records.
			 */
			de32p = (mddb_de32_t *)
			    ((void *) ((caddr_t)
			    (&db32p->db32_firstentry)
			    + sizeof (db32p->db32_firstentry)));
			tdep = (mddb_de_ic_t *)
			    kmem_zalloc(sizeof (mddb_de_ic_t) -
			    sizeof (mddb_block_t) +
			    sizeof (mddb_block_t) *
			    de32p->de32_blkcount, KM_SLEEP);
			de32tode(de32p, tdep);
			first_dep = tdep;
			while (de32p && de32p->de32_next) {
				de32p2 = nextentry(de32p);
				dep2 = (mddb_de_ic_t *)kmem_zalloc(
				    sizeof (mddb_de_ic_t) -
				    sizeof (mddb_block_t) +
				    sizeof (mddb_block_t) *
				    de32p2->de32_blkcount, KM_SLEEP);
				de32tode(de32p2, dep2);
				tdep->de_next = dep2;
				tdep = dep2;
				de32p = de32p2;
			}

			/* Now, walk the incore directory entry list */
			for (dep = dbp->db_firstentry; dep;
			    dep = dep->de_next) {
				if (! (dep->de_flags & MDDB_F_OPT))
					continue;
				/*
				 * Found an opt record in the incore copy.
				 * Find the corresponding entry in the temp
				 * list.  If anything has changed in the
				 * opt record info between the incore copy
				 * and the temp copy, update the incore copy
				 * and set a flag to writeout the opt record
				 * to the new mddb locations.
				 */
				for (tdep = first_dep; tdep;
				    tdep = tdep->de_next) {
					if (dep->de_recid == tdep->de_recid) {
						writeout = 0;
						/* Check first mddb location */
						if ((dep->de_optinfo[0].o_li !=
						    tdep->de_optinfo[0].o_li) ||
						    (dep->de_optinfo[0].
						    o_flags != tdep->de_optinfo
						    [0].o_flags)) {
							dep->de_optinfo[0] =
							    tdep->de_optinfo[0];
							writeout = 1;
						}
						/* Check second mddb location */
						if ((dep->de_optinfo[1].o_li !=
						    tdep->de_optinfo[1].o_li) ||
						    (dep->de_optinfo[1].
						    o_flags != tdep->de_optinfo
						    [1].o_flags)) {
							dep->de_optinfo[1] =
							    tdep->de_optinfo[1];
							writeout = 1;
						}
						/*
						 * Record owner should rewrite
						 * it
						 */
						if ((writeout) &&
						    (dep->de_owner_nodeid ==
						    md_set[mpp->c_setno].
						    s_nodeid))
							(void) writeoptrecord(s,
							    dep);
						break;
					}
				}
			}
			/*
			 * Update the incore checksum information for this
			 * directory block to match the newly read in checksum.
			 * This should have only changed if the incore and
			 * temp directory entries differed, but it takes
			 * more code to do the check than to just update
			 * the information every time.
			 */
			dbp->db_checksum = db32p->db32_checksum;

			/* Now free everything */
			tdep = first_dep;
			while (tdep) {
				dep2 = tdep->de_next;
				kmem_free((caddr_t)tdep,
				    sizeofde(tdep));
				tdep = dep2;
			}
			kmem_free((caddr_t)db32p, MDDB_BSIZE);
		}
		rval = 0;
	}
out:
	single_thread_end(s);
	mddb_setexit_no_parse(s);
	return (rval);
}

/*
 * mddb_block
 *
 * Sets and/or clears the MD_SET_MNPARSE_BLK status bit of the set named
 * in mbp->c_setno, according to the MDDB_BLOCK_PARSE and
 * MDDB_UNBLOCK_PARSE bits of mbp->c_blk_flags.
 *
 * Returns EINVAL for an out-of-range set number or a set that is not a
 * MN set, the result of mddbstatus2error() if the set cannot be entered,
 * and otherwise the value left in the local status word (initialised
 * to 0).
 */
int
mddb_block(mddb_block_parm_t *mbp)
{
	md_error_t	*errp = &mbp->c_mde;
	mddb_set_t	*setp;
	int		status = 0;

	/* Reject set numbers outside of the configured range */
	if (mbp->c_setno >= md_nsets) {
		return (EINVAL);
	}

	/*
	 * While the new_master flag is set for this set a reconfig cycle
	 * is in progress; blocking or unblocking is not needed then, so
	 * report success right away.
	 */
	if (md_get_setstatus(mbp->c_setno) & MD_SET_MN_NEWMAS_RC) {
		return (0);
	}

	/* A failure to snarf the local set is reported as success */
	if (md_snarf_db_set(MD_LOCAL_SET, errp) != 0) {
		return (0);
	}

	setp = mddb_setenter(mbp->c_setno, MDDB_MUSTEXIST, &status);
	if (setp == NULL) {
		return (mddbstatus2error(errp, status, NODEV32,
		    mbp->c_setno));
	}

	/* Only multi-node disksets are supported */
	if (!(MD_MNSET_SETNO(mbp->c_setno))) {
		mddb_setexit_no_parse(setp);
		return (EINVAL);
	}

	single_thread_start(setp);

	/* Block request is applied first, unblock request second */
	if (mbp->c_blk_flags & MDDB_BLOCK_PARSE) {
		md_set_setstatus(mbp->c_setno, MD_SET_MNPARSE_BLK);
	}
	if (mbp->c_blk_flags & MDDB_UNBLOCK_PARSE) {
		md_clr_setstatus(mbp->c_setno, MD_SET_MNPARSE_BLK);
	}

	single_thread_end(setp);
	mddb_setexit_no_parse(setp);

	return (status);
}

/*
 * mddb_optrecfix marks up to 2 mddbs as failed and calls fixoptrecords
 * to relocate any optimized resync records to available mddbs.
 * This routine is only called on the master node.
 *
 * Used in a MN diskset when a slave node has failed to write an optimized
 * resync record.  The failed mddb information is sent to the master node
 * so the master can relocate the optimized records, if possible.  If the
 * failed mddb information has a mddb marked as failed that was previously
 * marked active on the master, the master sets its incore mddb state to
 * EWRITE and sets the PARSE_LOCBLK flag.  The master node then attempts
 * to relocate any optimized records on the newly failed mddbs by calling
 * fixoptrecords.  (fixoptrecords will set the PARSE_OPTRECS flag if any
 * optimized records are relocated.)
 *
 * When mddb_optrecfix is finished, the ioctl exit code will notice the PARSE
 * flags and will send a PARSE message to the slave nodes.  The PARSE_LOCBLK
 * flag causes the slave node to re-read in the locator block from disk.
 * The PARSE_OPTRECS flag causes the slave node to re-read in the directory
 * blocks and write out any optimized resync records that have been
 * relocated to a different mddb.
 *
 * The locator index passed in by the slave (r_li) is only used as a hint.
 * It is range checked against lb_loccnt before it is used to index the
 * locator arrays; an out-of-range hint is handled like a hint that does
 * not match, i.e. all replicas are searched for the failed mddb.
 *
 * Return values:
 *	EINVAL - set number out of range or set is not a MN diskset
 *	result of mddbstatus2error() - the set could not be entered
 *	0 - all other cases (including failure to snarf the local set)
 */
int
mddb_optrecfix(mddb_optrec_parm_t *mop)
{
	mddb_set_t		*s;
	int			err = 0;
	mddb_lb_t		*lbp;
	mddb_mnlb_t		*mnlbp;
	mddb_locator_t		*lp;
	int			li;
	mddb_mnsidelocator_t	*mnslp;
	mddb_drvnm_t		*dn;
	int			i, j;
	md_replica_recerr_t	*recerr;
	md_error_t		*ep = &mop->c_mde;
	int			something_changed = 0;
	int			matched;
	int			alc, lc;
	int			setno;

	setno = mop->c_setno;
	if (mop->c_setno >= md_nsets)
		return (EINVAL);

	if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0)
		return (0);

	if ((s = mddb_setenter(mop->c_setno, MDDB_MUSTEXIST, &err)) == NULL) {
		return (mddbstatus2error(ep, err, NODEV32, mop->c_setno));
	}

	if (!(MD_MNSET_SETNO(mop->c_setno))) {
		mddb_setexit(s);
		return (EINVAL);
	}

	single_thread_start(s);
	lbp = s->s_lbp;
	mnlbp = (mddb_mnlb_t *)lbp;

	/*
	 * If slave node has seen an mddb failure, but the master node
	 * hasn't encountered this failure, mark the mddb as failed on
	 * the master node and set the something_changed flag to 1.
	 */
	for (i = 0; i < 2; i++) {
		recerr = &mop->c_recerr[i];
		if (!(recerr->r_flags & MDDB_F_EWRITE))
			continue;

		matched = 0;
		lp = NULL;

		/*
		 * Do quick check using li.  The index comes from another
		 * node, so make sure it lies within the locators in use
		 * before indexing any array with it.
		 */
		li = recerr->r_li;
		if ((li >= 0) && (li < lbp->lb_loccnt)) {
			lp = &lbp->lb_locators[li];
			for (j = 0; j < MD_MNMAXSIDES; j++) {
				mnslp = &mnlbp->lb_mnsidelocators[j][li];
				if (mnslp->mnl_sideno == s->s_sideno)
					break;
			}
			if (j != MD_MNMAXSIDES) {
				dn = &lbp->lb_drvnm[mnslp->mnl_drvnm_index];
				if ((strncmp(dn->dn_data,
				    recerr->r_driver_name,
				    MD_MAXDRVNM) == 0) &&
				    (recerr->r_blkno == lp->l_blkno) &&
				    (recerr->r_mnum == mnslp->mnl_mnum))
					matched = 1;
			}
		}

		if (!matched) {
			/*
			 * Passed in li from slave does not match
			 * the replica in the master's structures.
			 * This could have occurred if a delete
			 * mddb command was running when the
			 * optimized resync record had a failure.
			 * Search all replicas for this entry.
			 * If no match, just ignore.
			 * If a match, set replica in error.
			 */
			for (li = 0; li < lbp->lb_loccnt; li++) {
				lp = &lbp->lb_locators[li];
				if (lp->l_flags & MDDB_F_DELETED)
					continue;

				for (j = 0; j < MD_MNMAXSIDES; j++) {
					mnslp =
					    &mnlbp->lb_mnsidelocators[j][li];
					if (mnslp->mnl_sideno == s->s_sideno)
						break;
				}
				if (j == MD_MNMAXSIDES)
					continue;

				dn = &lbp->lb_drvnm[mnslp->mnl_drvnm_index];
				if ((strncmp(dn->dn_data,
				    recerr->r_driver_name,
				    MD_MAXDRVNM) == 0) &&
				    (recerr->r_blkno == lp->l_blkno) &&
				    (recerr->r_mnum == mnslp->mnl_mnum)) {
					matched = 1;
					break;
				}
			}
		}

		/*
		 * lp refers to the matching locator whenever matched is
		 * set.  Only record a change if the master still
		 * considered this mddb active or had not yet seen a
		 * write error on it.
		 */
		if (matched &&
		    ((lp->l_flags & MDDB_F_ACTIVE) ||
		    ((lp->l_flags & MDDB_F_EWRITE) == 0))) {
			something_changed = 1;
			lp->l_flags |= MDDB_F_EWRITE;
			lp->l_flags &= ~MDDB_F_ACTIVE;
		}
	}

	/*
	 * If this message changed nothing, then we're done since this
	 * failure has already been handled.
	 * If some mddb state has been changed, send a parse message to
	 * the slave nodes so that the slaves will re-read the locator
	 * block from disk.
	 */
	if (something_changed == 0) {
		single_thread_end(s);
		mddb_setexit(s);
		return (0);
	}
	s->s_mn_parseflags |= MDDB_PARSE_LOCBLK;

	/*
	 * Scan replicas setting MD_SET_TOOFEW if
	 * 50% or more of the mddbs have seen errors.
	 * Note: Don't call selectreplicas or writeretry
	 * since these routines may end up setting the ACTIVE flag
	 * on a failed mddb if the master is able to access the mddb
	 * but the slave node couldn't.  Need to have the ACTIVE flag
	 * turned off in order to relocate the optimized records to
	 * mddbs that are (hopefully) available on all nodes.
	 */
	alc = 0;
	lc = 0;
	for (li = 0; li < lbp->lb_loccnt; li++) {
		lp = &lbp->lb_locators[li];
		if (lp->l_flags & MDDB_F_DELETED)
			continue;
		lc++;
		if (! (lp->l_flags & MDDB_F_ACTIVE))
			continue;
		alc++;
	}

	/*
	 * If more than 50% mddbs have failed, then don't relocate opt recs.
	 * The node sending the mddb failure information will detect TOOFEW
	 * and will panic when it attempts to re-write the optimized record.
	 */
	if (alc < ((lc + 1) / 2)) {
		md_set_setstatus(setno, MD_SET_TOOFEW);
		(void) push_lb(s);
		(void) upd_med(s, "mddb_optrecfix(0)");
		single_thread_end(s);
		mddb_setexit(s);
		return (0);
	}

	/* Attempt to relocate optimized records that are on failed mddbs */
	(void) fixoptrecords(s);

	/* Push changed locator block out to disk */
	(void) push_lb(s);
	(void) upd_med(s, "mddb_optrecfix(1)");

	/* Recheck for TOOFEW after writing out locator blocks */
	alc = 0;
	lc = 0;
	for (li = 0; li < lbp->lb_loccnt; li++) {
		lp = &lbp->lb_locators[li];
		if (lp->l_flags & MDDB_F_DELETED)
			continue;
		lc++;
		if (! (lp->l_flags & MDDB_F_ACTIVE))
			continue;
		alc++;
	}

	/* Flag the set if more than 50% of the mddbs have failed by now */
	if (alc < ((lc + 1) / 2))
		md_set_setstatus(setno, MD_SET_TOOFEW);

	single_thread_end(s);
	mddb_setexit(s);
	return (0);
}

/*
 * Check if incore mddb on master node matches ondisk mddb.
 * If not, master writes out incore view to all mddbs.
 * Have previously verified that master is an owner of the
 * diskset (master has snarfed diskset) and that diskset is
 * not stale.
 *
 * Meant to be called during reconfig cycle during change of master.
 * Previous master in diskset may have changed the mddb and
 * panic'd before relaying information to slave nodes.  New
 * master node just writes out its incore view of the mddb and
 * the replay of the change log will resync all the nodes.
 *
 * Only supported for MN disksets.
 *
 * The first and the last active, readable mddb are compared against the
 * incore copy (locator block, locator names, then the records through
 * checkcopy()).  A mismatch, or the absence of any readable mddb while
 * non-deleted mddbs exist, causes the incore view to be written out.
 * After such a write the routine panics if fewer than 50% of the
 * non-deleted mddbs are still usable.
 *
 * Return values:
 *	0 - success (also returned when the local set cannot be snarfed)
 *	non-zero - failure (EINVAL, or the value produced by
 *		mddbstatus2error()/mdmddberror())
 */
int
mddb_check_write_ioctl(mddb_config_t *info)
{
	int			err = 0;
	set_t			setno = info->c_setno;
	mddb_set_t		*s;
	int			li;
	mddb_locator_t		*lp;
	mddb_lb_t		*lbp;
	mddb_mnlb_t		*mnlbp_od;
	mddb_ln_t		*lnp;
	mddb_mnln_t		*mnlnp_od;
	mddb_db_t		*dbp;
	mddb_de_ic_t		*dep;
	/*
	 * Must start out as 0: the first scan below may finish without
	 * ever assigning it (e.g. every locator is deleted) and the
	 * flag is tested right after that scan.
	 */
	int			write_out_mddb = 0;
	md_error_t		*ep = &info->c_mde;
	int			mddb_err = 0;
	int			prev_li = 0;
	int			rval = 0;
	int			alc, lc;
	int			mddbs_present = 0;

	/* Verify that setno is in valid range */
	if (setno >= md_nsets)
		return (EINVAL);

	if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0)
		return (0);

	if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL) {
		return (mddbstatus2error(ep, err, NODEV32, setno));
	}

	/* Calling diskset must be a MN diskset */
	if (!(MD_MNSET_SETNO(setno))) {
		mddb_setexit(s);
		return (EINVAL);
	}

	/* Re-verify that set is not stale */
	if (md_get_setstatus(setno) & MD_SET_STALE) {
		mddb_setexit(s);
		return (mdmddberror(ep, MDE_DB_STALE, NODEV32, setno));
	}

	lbp = s->s_lbp;
	lnp = s->s_lnp;

	/*
	 * Previous master could have died during the write of data to
	 * the mddbs so that the ondisk mddbs may not be consistent.
	 * So, need to check the contents of the first and last active mddb
	 * to see if the mddbs need to be rewritten.
	 */
	for (li = 0; li < lbp->lb_loccnt; li++) {
		int	checkcopy_err;

		lp = &lbp->lb_locators[li];
		/* Find replica that is active */
		if (lp->l_flags & MDDB_F_DELETED)
			continue;
		mddbs_present = 1;
		if (! (lp->l_flags & MDDB_F_ACTIVE))
			continue;
		if (s->s_mbiarray[li] == NULL)
			continue;
		/* Check locator block */
		mnlbp_od = (mddb_mnlb_t *)kmem_zalloc(dbtob(MDDB_MNLBCNT),
		    KM_SLEEP);
		/* read in on-disk locator block */
		err = readblks(s, (caddr_t)mnlbp_od, 0, lbp->lb_blkcnt, li);

		/* If err, try next mddb */
		if (err) {
			kmem_free(mnlbp_od, dbtob(MDDB_MNLBCNT));
			continue;
		}

		/*
		 * We resnarf all changelog entries for this set.
		 * They may have been altered by the previous master
		 */
		for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
			for (dep = dbp->db_firstentry; dep; dep =
			    dep->de_next) {
				if ((dep->de_flags & MDDB_F_CHANGELOG) == 0) {
					continue;
				}
				/*
				 * This has been alloc'ed while
				 * joining the set
				 */
				if (dep->de_rb) {
					kmem_free(dep->de_rb, dep->de_recsize);
					dep->de_rb = (mddb_rb32_t *)NULL;
				}
				if (dep->de_rb_userdata) {
					kmem_free(dep->de_rb_userdata,
					    dep->de_reqsize);
					dep->de_rb_userdata = (caddr_t)NULL;
				}

				err = getrecord(s, dep, li);
				if (err) {
					/*
					 * When we see an error while reading
					 * the changelog entries, we move on
					 * to the next mddb
					 */
					err = 1;
					break; /* out of inner for-loop */
				}
				allocuserdata(dep);
			}
			if (err)
				break; /* out of outer for-loop */
		}

		/* If err, try next mddb */
		if (err) {
			kmem_free(mnlbp_od, dbtob(MDDB_MNLBCNT));
			continue;
		}

		/*
		 * Is incore locator block same as ondisk?
		 * Any non-zero result from bcmp means the blocks differ.
		 */
		if (bcmp((mddb_mnlb_t *)lbp, mnlbp_od, dbtob(MDDB_MNLBCNT))
		    != 0) {
			write_out_mddb = 1;
			kmem_free((caddr_t)mnlbp_od, dbtob(MDDB_MNLBCNT));
			break;
		}

		kmem_free((caddr_t)mnlbp_od, dbtob(MDDB_MNLBCNT));

		/* If lb ok, check locator names */
		mnlnp_od = (mddb_mnln_t *)kmem_zalloc(dbtob(MDDB_MNLNCNT),
		    KM_SLEEP);
		/* read in on-disk locator names */
		err = readblks(s, (caddr_t)mnlnp_od, lbp->lb_lnfirstblk,
		    lbp->lb_lnblkcnt, li);

		/* If err, try next mddb */
		if (err) {
			kmem_free(mnlnp_od, dbtob(MDDB_MNLNCNT));
			continue;
		}

		/* Are incore locator names same as ondisk? */
		if (bcmp((mddb_mnln_t *)lnp, mnlnp_od, dbtob(MDDB_MNLNCNT))
		    != 0) {
			kmem_free((caddr_t)mnlnp_od, dbtob(MDDB_MNLNCNT));
			write_out_mddb = 1;
			break;
		}

		kmem_free((caddr_t)mnlnp_od, dbtob(MDDB_MNLNCNT));

		/*
		 * Check records in mddb.
		 * If a read error is encountered, set the error flag and
		 * continue to the next mddb.  Otherwise, if incore data is
		 * different from ondisk, then set the flag to write out
		 * the mddb and break out.
		 */
		checkcopy_err = checkcopy(s, li);
		if (checkcopy_err == MDDB_F_EREAD) {
			lp->l_flags |= MDDB_F_EREAD;
			mddb_err = 1;
			continue;
		} else if (checkcopy_err == 1) {
			write_out_mddb = 1;
			break;
		}
		/*
		 * Have found first active mddb and the data is the same as
		 * incore - break out of loop
		 */
		write_out_mddb = 0;
		break;
	}

	/*
	 * Skip checking for last active mddb if:
	 *	- already found a mismatch in the first active mddb
	 *		(write_out_mddb is 1)  OR
	 * 	- didn't find a readable mddb when looking for first
	 *	  active mddb (there are mddbs present but all failed
	 *	  when read was attempted).
	 *
	 * In either case, go to write_out_mddb label in order to attempt
	 * to write out the data. If < 50% mddbs are available, panic.
	 */
	if ((write_out_mddb == 1) ||
	    ((li == lbp->lb_loccnt) && mddbs_present)) {
		write_out_mddb = 1;
		goto write_out_mddb;
	}

	/*
	 * Save which index was checked for the first active mddb.  If only 1
	 * active mddb, don't want to recheck the same mddb when looking for
	 * last active mddb.
	 */
	prev_li = li;

	/*
	 * Now, checking for last active mddb.  If found same index as before
	 * (only 1 active mddb), then skip.
	 */
	for (li = (lbp->lb_loccnt - 1); li >= 0; li--) {
		int	checkcopy_err;

		lp = &lbp->lb_locators[li];
		/* Find replica that is active */
		if (! (lp->l_flags & MDDB_F_ACTIVE))
			continue;
		if (lp->l_flags & MDDB_F_DELETED)
			continue;
		if (s->s_mbiarray[li] == NULL)
			continue;
		/* If already checked mddb, bail out */
		if (li == prev_li)
			break;
		/* Check locator block */
		mnlbp_od = (mddb_mnlb_t *)kmem_zalloc(dbtob(MDDB_MNLBCNT),
		    KM_SLEEP);
		/* read in on-disk locator block */
		err = readblks(s, (caddr_t)mnlbp_od, 0, lbp->lb_blkcnt, li);

		/* If err, try next mddb */
		if (err) {
			kmem_free(mnlbp_od, dbtob(MDDB_MNLBCNT));
			continue;
		}

		/* Is incore locator block same as ondisk? */
		if (bcmp((mddb_mnlb_t *)lbp, mnlbp_od, dbtob(MDDB_MNLBCNT))
		    != 0) {
			kmem_free((caddr_t)mnlbp_od, dbtob(MDDB_MNLBCNT));
			write_out_mddb = 1;
			break;
		}

		kmem_free((caddr_t)mnlbp_od, dbtob(MDDB_MNLBCNT));

		/* If lb ok, check locator names */
		mnlnp_od = (mddb_mnln_t *)
		    kmem_zalloc(dbtob(MDDB_MNLNCNT), KM_SLEEP);

		/* read in on-disk locator names */
		err = readblks(s, (caddr_t)mnlnp_od, lbp->lb_lnfirstblk,
		    lbp->lb_lnblkcnt, li);

		/* If err, try next mddb */
		if (err) {
			kmem_free(mnlnp_od, dbtob(MDDB_MNLNCNT));
			continue;
		}

		/* Are incore locator names same as ondisk? */
		if (bcmp((mddb_mnln_t *)lnp, mnlnp_od, dbtob(MDDB_MNLNCNT))
		    != 0) {
			kmem_free((caddr_t)mnlnp_od, dbtob(MDDB_MNLNCNT));
			write_out_mddb = 1;
			break;
		}

		kmem_free((caddr_t)mnlnp_od, dbtob(MDDB_MNLNCNT));

		/*
		 * Check records in mddb.
		 * If a read error is encountered, set the error flag and
		 * continue to the next mddb.  Otherwise, if incore data is
		 * different from ondisk, then set the flag to write out
		 * the mddb and break out.
		 */
		checkcopy_err = checkcopy(s, li);
		if (checkcopy_err == MDDB_F_EREAD) {
			lp->l_flags |= MDDB_F_EREAD;
			mddb_err = 1;
			continue;
		} else if (checkcopy_err == 1) {
			write_out_mddb = 1;
			break;
		}
		/*
		 * Have found last active mddb and the data is the same as
		 * incore - break out of loop
		 */
		write_out_mddb = 0;
		break;
	}

	/*
	 * If ondisk and incore versions of the mddb don't match, then
	 * write out this node's incore version to disk.
	 * Or, if unable to read a copy of the mddb, attempt to write
	 * out a new one.
	 */
write_out_mddb:
	if (write_out_mddb) {
		/* Recompute free blocks based on incore information */
		computefreeblks(s); /* set up free block bits */

		/*
		 * Write directory entries and record blocks.
		 * Use flag MDDB_WRITECOPY_SYNC so that writecopy
		 * routine won't write out change log records.
		 */
		for (li = 0; li < lbp->lb_loccnt; li++) {
			lp = &lbp->lb_locators[li];
			/* Don't write to inactive or deleted mddbs */
			if (! (lp->l_flags & MDDB_F_ACTIVE))
				continue;
			if (lp->l_flags & MDDB_F_DELETED)
				continue;
			if (s->s_mbiarray[li] == NULL)
				continue;
			/* If encounter a write error, save it for later */
			if (writecopy(s, li, MDDB_WRITECOPY_SYNC)) {
				lp->l_flags |= MDDB_F_EWRITE;
				mddb_err = 1;
			}
		}

		/*
		 * Write out locator blocks to all replicas.
		 * push_lb will set MDDB_F_EWRITE on replicas that fail.
		 */
		if (push_lb(s))
			mddb_err = 1;
		(void) upd_med(s, "mddb_check_write_ioctl(0)");

		/* Write out locator names to all replicas */
		lnp = s->s_lnp;
		uniqtime32(&lnp->ln_timestamp);
		lnp->ln_revision = MDDB_REV_MNLN;
		crcgen(lnp, &lnp->ln_checksum, dbtob(lbp->lb_lnblkcnt), NULL);

		/* writeall sets MDDB_F_EWRITE if writes fails to replica */
		if (writeall(s, (caddr_t)lnp, lbp->lb_lnfirstblk,
		    lbp->lb_lnblkcnt, 0))
			mddb_err = 1;

		/*
		 * The writes to the replicas above would have set
		 * the MDDB_F_EWRITE flags if any write error was
		 * encountered.
		 * If < 50% of the mddbs are available, panic.
		 */
		lc = alc = 0;
		for (li = 0; li < lbp->lb_loccnt; li++) {
			lp = &lbp->lb_locators[li];
			if (lp->l_flags & MDDB_F_DELETED)
				continue;
			lc++;
			/*
			 * If mddb:
			 *	- is not active (previously had an error)
			 *	- had an error reading the master blocks  or
			 *	- had an error in writing to the mddb
			 * then don't count this mddb in the active count.
			 */
			if (! (lp->l_flags & MDDB_F_ACTIVE) ||
			    (lp->l_flags & MDDB_F_EMASTER) ||
			    (lp->l_flags & MDDB_F_EWRITE))
				continue;
			alc++;
		}
		if (alc < ((lc + 1) / 2)) {
			cmn_err(CE_PANIC,
			    "md: Panic due to lack of DiskSuite state\n"
			    " database replicas. Fewer than 50%% of "
			    "the total were available,\n so panic to "
			    "ensure data integrity.");
		}
	}

	/*
	 * If encountered an error during checking or writing of
	 * mddbs, call selectreplicas so that replica error can
	 * be properly handled. This will involve another attempt
	 * to write the mddb out to any mddb marked MDDB_F_EWRITE.
	 * If mddb still fails, it will have the MDDB_F_ACTIVE bit
	 * turned off. Set the MDDB_SCANALLSYNC flag so that
	 * selectreplicas doesn't overwrite the change log entries.
	 *
	 * Set the PARSE_LOCBLK flag in the mddb_set structure to show
	 * that the locator block has been changed.
	 */
	if (mddb_err) {
		(void) selectreplicas(s, MDDB_SCANALLSYNC);
		s->s_mn_parseflags |= MDDB_PARSE_LOCBLK;
	}

	/* rval is never changed after initialisation, so this returns 0 */
	mddb_setexit(s);
	return (rval);
}

/*
 * mddb_setflags_ioctl
 *
 * Sets, clears or reports the reconfig-cycle status bits
 * (MD_SET_MN_NEWMAS_RC, MD_SET_MN_START_RC, MD_SET_MN_MIR_STATE_RC)
 * of a set.  The operation is selected by info->sf_flags; for the set
 * and reset operations the bits to act on are taken from
 * info->sf_setflags, for the get operation the current bits are stored
 * into info->sf_setflags.  Any other sf_flags value is accepted and
 * does nothing.
 *
 * Used during reconfig cycle.
 * Only supported for MN disksets.
 *
 * Return values:
 *	0 - success
 *	EINVAL - set number out of range or bad magic value
 */
int
mddb_setflags_ioctl(mddb_setflags_config_t *info)
{
	set_t	setno = info->sf_setno;

	/* The set number has to be within the configured range */
	if (setno >= md_nsets) {
		return (EINVAL);
	}

	/*
	 * The set may not have been snarfed yet when its flags are
	 * manipulated, so neither SNARFED nor MNset is checked and
	 * mddb_setenter is not called.  The magic value in the structure
	 * is verified instead in order to discourage bad ioctl calls.
	 */
	if (info->sf_magic != MDDB_SETFLAGS_MAGIC) {
		return (EINVAL);
	}

	if (info->sf_flags == MDDB_NM_SET) {
		/* Turn on each requested reconfig bit */
		if (info->sf_setflags & MD_SET_MN_NEWMAS_RC) {
			md_set_setstatus(setno, MD_SET_MN_NEWMAS_RC);
		}
		if (info->sf_setflags & MD_SET_MN_START_RC) {
			md_set_setstatus(setno, MD_SET_MN_START_RC);
		}
		if (info->sf_setflags & MD_SET_MN_MIR_STATE_RC) {
			md_set_setstatus(setno, MD_SET_MN_MIR_STATE_RC);
		}
	} else if (info->sf_flags == MDDB_NM_RESET) {
		/* Turn off each requested reconfig bit */
		if (info->sf_setflags & MD_SET_MN_NEWMAS_RC) {
			md_clr_setstatus(setno, MD_SET_MN_NEWMAS_RC);
		}
		if (info->sf_setflags & MD_SET_MN_START_RC) {
			md_clr_setstatus(setno, MD_SET_MN_START_RC);
		}
		if (info->sf_setflags & MD_SET_MN_MIR_STATE_RC) {
			md_clr_setstatus(setno, MD_SET_MN_MIR_STATE_RC);
		}
	} else if (info->sf_flags == MDDB_NM_GET) {
		/* Report only the three reconfig bits back to the caller */
		info->sf_setflags = md_get_setstatus(setno) &
		    (MD_SET_MN_NEWMAS_RC | MD_SET_MN_START_RC |
		    MD_SET_MN_MIR_STATE_RC);
	}

	return (0);
}

/*
 * md_update_minor
 *
 * This function updates the minor in the namespace entry for an
 * underlying metadevice.  The function is called in mod_imp_set
 * where mod is sp, stripe, mirror and raid.
 *
 * Every side from 0 up to MD_MAXSIDES is looked up for 'key'.  An entry
 * that belongs to the import 'side' gets its n_minor rebuilt for 'setno'
 * when its shared driver name is "md"; an entry of any other side is
 * removed.
 *
 * Returns 1 on success and 0 if the namespace cannot be loaded or has
 * no first record.
 */
int
md_update_minor(
	set_t	setno,
	side_t	side,
	mdkey_t	key
)
{
	struct nm_next_hdr	*rec_hdr;
	struct nm_name		*entry;
	char			*drvname;
	side_t			cur_side;

	/*
	 * Load the devid name space if it exists
	 */
	(void) md_load_namespace(setno, NULL, NM_DEVID);
	if (! md_load_namespace(setno, NULL, 0L)) {
		/*
		 * Unload the devid namespace
		 */
		(void) md_unload_namespace(setno, NM_DEVID);
		return (0);
	}

	rw_enter(&nm_lock.lock, RW_READER);

	rec_hdr = get_first_record(setno, 0, NM_NOTSHARED);
	if (rec_hdr == NULL) {
		rw_exit(&nm_lock.lock);
		return (0);
	}

	/*
	 * Look up the key on every side
	 */
	for (cur_side = 0; cur_side < MD_MAXSIDES; cur_side++) {
		entry = lookup_entry(rec_hdr, setno, cur_side, key,
		    NODEV64, 0L);
		if (entry == NULL)
			continue;

		if (entry->n_side != side) {
			/* We are not the import side, cleanup */
			(void) remove_entry(rec_hdr, entry->n_side, key, 0L);
			continue;
		}

		/*
		 * Import side: update its n_minor if metadevice
		 */
		drvname = (char *)getshared_name(setno, entry->n_drv_key, 0L);
		if (drvname != NULL && strcmp(drvname, "md") == 0) {
			entry->n_minor = MD_MKMIN(setno,
			    MD_MIN2UNIT(entry->n_minor));
		}
	}

	rw_exit(&nm_lock.lock);
	return (1);
}

/*
 * md_update_top_device_minor
 *
 * This function updates the minor in the namespace entry for a top
 * level metadevice.  The function is called in mod_imp_set where
 * mod is sp, stripe, mirror and raid.
 *
 * The entry is looked up by device ('dev') on the given 'side' with a
 * wildcard key.  A missing entry is not an error.
 *
 * Returns 1 on success and 0 if the namespace cannot be loaded, has no
 * first record, or the shared name of a found entry cannot be obtained.
 */
int
md_update_top_device_minor(
	set_t	setno,
	side_t	side,
	md_dev64_t dev
)
{
	struct nm_next_hdr	*rec_hdr;
	struct nm_name		*entry;
	char			*drvname;
	int			ok = 0;

	/*
	 * Load the devid name space if it exists
	 */
	(void) md_load_namespace(setno, NULL, NM_DEVID);
	if (! md_load_namespace(setno, NULL, 0L)) {
		/*
		 * Unload the devid namespace
		 */
		(void) md_unload_namespace(setno, NM_DEVID);
		return (0);
	}

	rw_enter(&nm_lock.lock, RW_READER);

	rec_hdr = get_first_record(setno, 0, NM_NOTSHARED);
	if (rec_hdr != NULL) {
		entry = lookup_entry(rec_hdr, setno, side, MD_KEYWILD,
		    dev, 0L);
		if (entry == NULL) {
			/* Nothing to update */
			ok = 1;
		} else {
			drvname = (char *)getshared_name(setno,
			    entry->n_drv_key, 0L);
			if (drvname != NULL) {
				/*
				 * Found the entry, update its n_minor if
				 * metadevice
				 */
				if (strcmp(drvname, "md") == 0) {
					entry->n_minor = MD_MKMIN(setno,
					    MD_MIN2UNIT(entry->n_minor));
				}
				ok = 1;
			}
		}
	}

	rw_exit(&nm_lock.lock);
	return (ok);
}

/*
 * md_imp_nm
 *
 * Walks every directory entry of the set and, for namespace header and
 * namespace records, rebuilds each positive r_next_recid as
 * MAKERECID(setno, DBID(old value)) using the set number of 's'.
 * Entries of any other type are left alone.
 */
static void
md_imp_nm(
	mddb_set_t	*s
)
{
	set_t			setno = s->s_setno;
	mddb_db_t		*blk;
	mddb_de_ic_t		*ent;
	struct nm_header	*nmhdr;
	struct nm_rec_hdr	*fixup[2];
	int			nfixup;
	int			k;

	for (blk = s->s_dbp; blk != NULL; blk = blk->db_next) {
		for (ent = blk->db_firstentry; ent != NULL;
		    ent = ent->de_next) {
			/* Collect the record headers this entry carries */
			switch (ent->de_type1) {
			case MDDB_NM_HDR:
			case MDDB_DID_NM_HDR:
				/* h_names is handled before h_shared */
				nmhdr = (struct nm_header *)
				    ent->de_rb_userdata;
				fixup[0] = &nmhdr->h_names;
				fixup[1] = &nmhdr->h_shared;
				nfixup = 2;
				break;

			case MDDB_NM:
			case MDDB_DID_NM:
			case MDDB_SHR_NM:
			case MDDB_DID_SHR_NM:
				fixup[0] = (struct nm_rec_hdr *)
				    ent->de_rb_userdata;
				nfixup = 1;
				break;

			default:
				nfixup = 0;
				break;
			}

			/* Re-stamp the next-record ids that are in use */
			for (k = 0; k < nfixup; k++) {
				if (fixup[k]->r_next_recid > 0) {
					fixup[k]->r_next_recid =
					    MAKERECID(setno,
					    DBID(fixup[k]->r_next_recid));
				}
			}
		}
	}
}

/*
 * update_db_rec
 *
 * Commits, one at a time, every record of the set whose directory entry
 * does not carry the MDDB_F_OPT flag.
 *
 * Returns 0 when all commits succeed and MDDB_E_NORECORD as soon as
 * mddb_commitrecs() reports a failure.
 */
static int
update_db_rec(
	mddb_set_t	*s
)
{
	mddb_db_t	*blk;
	mddb_de_ic_t	*ent;
	mddb_recid_t	commit_list[2];

	blk = s->s_dbp;
	while (blk != NULL) {
		ent = blk->db_firstentry;
		while (ent != NULL) {
			if ((ent->de_flags & MDDB_F_OPT) == 0) {
				/* Zero-terminated list with a single id */
				commit_list[0] = MAKERECID(s->s_setno,
				    ent->de_recid);
				commit_list[1] = 0;
				if (mddb_commitrecs(commit_list) != 0)
					return (MDDB_E_NORECORD);
			}
			ent = ent->de_next;
		}
		blk = blk->db_next;
	}

	return (0);
}

/*
 * update_mb
 *
 * Calls update_mb_devid() for every replica info entry of the set that
 * is not flagged MDDB_F_EMASTER.  While the set has the
 * MD_SET_REPLICATED_IMPORT status the entry's own devid is passed on,
 * otherwise NULL is passed.
 *
 * Returns 0 on success, -1 if a replicated set has an entry without a
 * devid, or the first non-zero value returned by update_mb_devid().
 */
static int
update_mb(
	mddb_set_t	*s
)
{
	mddb_ri_t	*ri;
	ddi_devid_t	devid;
	int		rv;

	for (ri = s->s_rip; ri != NULL; ri = ri->ri_next) {
		/* disk is powered off or not there */
		if (ri->ri_flags & MDDB_F_EMASTER)
			continue;

		/*
		 * A non-replicated set does not need its devid updated,
		 * so NULL is handed down in that case.
		 */
		devid = NULL;
		if (md_get_setstatus(s->s_setno) & MD_SET_REPLICATED_IMPORT) {
			/* It is a replicated set: a devid is mandatory */
			if (ri->ri_devid == (ddi_devid_t)NULL)
				return (-1);
			devid = ri->ri_devid;
		}

		rv = update_mb_devid(s, ri, devid);
		if (rv != 0)
			return (rv);
	}

	return (0);
}

/*
 * update_setname
 *
 * Replaces the shared namespace entry found for the "/dev/md/" prefix
 * with a new entry named "/dev/md/<setname>/dsk/", where <setname> is
 * the s_setname of the set.  Key, count and data of the old entry are
 * carried over to the new one, and the new record is committed together
 * with the set's s_nmid record.
 *
 * Returns 0 on success or when there is no shared namespace record or
 * no matching entry, MDDB_E_NORECORD if the old entry cannot be removed
 * or the new one cannot be allocated, and otherwise the result of
 * mddb_commitrecs().
 */
static int
update_setname(
	set_t	setno
)
{
	char			*prefix = "/dev/md/";
	mddb_set_t		*setp;
	struct nm_next_hdr	*rec_hdr;
	struct nm_shared_name	*old_ent, *new_ent;
	char			*newname;
	int			len;
	mdkey_t			saved_key;
	uint32_t		saved_count, saved_data;
	mddb_recid_t		recid, ids[3];
	int			rv = 0;

	/* Build the name that carries the imported set's name */
	setp = (mddb_set_t *)md_set[setno].s_db;
	len = strlen(prefix) + strlen(setp->s_setname) + strlen("/dsk/") + 1;
	newname = kmem_zalloc(len, KM_SLEEP);
	(void) sprintf(newname, "%s%s%s", prefix, setp->s_setname, "/dsk/");

	rw_enter(&nm_lock.lock, RW_WRITER);

	/* No namespace is okay */
	rec_hdr = get_first_record(setno, 0, NM_SHARED);
	if (rec_hdr == NULL)
		goto done;

	/* No metadevice is okay */
	old_ent = (struct nm_shared_name *)lookup_shared_entry(rec_hdr,
	    0, prefix, NULL, NM_SHARED | NM_IMP_SHARED);
	if (old_ent == NULL)
		goto done;

	/*
	 * We have it, go ahead and update the namespace.  Remember what
	 * has to survive before the old entry is removed.
	 */
	saved_key = old_ent->sn_key;
	saved_count = old_ent->sn_count;
	saved_data = old_ent->sn_data;

	if (remove_shared_entry(rec_hdr, saved_key, NULL, 0L | NM_IMP_SHARED |
	    NM_NOCOMMIT | NM_KEY_RECYCLE)) {
		rv = MDDB_E_NORECORD;
		goto done;
	}

	new_ent = (struct nm_shared_name *)alloc_entry(rec_hdr,
	    md_set[setno].s_nmid, len, NM_SHARED | NM_NOCOMMIT, &recid);
	if (new_ent == NULL) {
		rv = MDDB_E_NORECORD;
		goto done;
	}

	new_ent->sn_key = saved_key;
	new_ent->sn_count = saved_count;
	new_ent->sn_data = saved_data;
	new_ent->sn_namlen = (ushort_t)len;
	(void) strcpy(new_ent->sn_name, newname);

	/* Commit the new record and the s_nmid record (zero terminated) */
	ids[0] = recid;
	ids[1] = md_set[setno].s_nmid;
	ids[2] = 0;
	rv = mddb_commitrecs(ids);

done:
	if (newname != NULL)
		kmem_free(newname, len);
	rw_exit(&nm_lock.lock);
	return (rv);
}

/*
 * md_imp_db
 *
 * Writes out the data tag, the locator information, the db records and
 * the set name embedded in the namespace for an imported set.  For a
 * stale import (MD_IMP_STALE_SET in stale_flag) replicas whose devid
 * fails validation are marked deleted first and the set's stale status
 * is cleared.
 *
 * Returns 0 on success.
 * On failure ep is filled in and the value produced by the md error
 * helper that recorded the failure is returned (described as -1 by the
 * original comment).
 */
static int
md_imp_db(
	set_t		setno,
	int		stale_flag,
	md_error_t	*ep
)
{
	mddb_set_t	*s;
	mddb_dt_t	*dtp;
	mddb_lb_t	*lbp;
	mddb_locator_t	*lp;
	ddi_devid_t	devid;
	md_dev64_t	ndev;
	int		nloc;
	int		idx;
	int		err = 0;

	s = mddb_setenter(setno, MDDB_MUSTEXIST, &err);
	if (s == NULL)
		return (mddbstatus2error(ep, err, NODEV32, setno));

	/* Update dt: regenerate its checksum if a data tag exists */
	dtp = (mddb_dt_t *)md_set[setno].s_dtp;
	if (dtp != NULL)
		crcgen(dtp, &dtp->dt_cks, MDDB_DT_BYTES, NULL);

	err = dt_write(s);
	if (err != 0) {
		err = mdsyserror(ep, err);
		mddb_setexit(s);
		return (err);
	}

	/*
	 * Update lb, no need to update the mediator because
	 * the diskset will only exist on the importing node
	 * and as such a mediator adds no value.
	 */
	if (stale_flag & MD_IMP_STALE_SET) {
		lbp = s->s_lbp;
		nloc = lbp->lb_loccnt;
		for (idx = 0; idx < nloc; idx++) {
			lp = &lbp->lb_locators[idx];

			/* Already deleted, go to next one. */
			devid = s->s_did_icp->did_ic_devid[idx];
			if (devid == NULL)
				continue;

			ndev = md_expldev(lp->l_dev);
			if (mddb_devid_validate((ddi_devid_t)devid, &ndev,
			    NULL)) {
				/* disk unavailable, mark deleted */
				lp->l_flags = MDDB_F_DELETED;
				/* then remove the device id from the list */
				free_mbipp(&s->s_mbiarray[idx]);
				(void) mddb_devid_delete(s, idx);
			}
		}
		md_clr_setstatus(setno, MD_SET_STALE);
	}

	err = writelocall(s);
	if (err != 0) {
		err = mdmddberror(ep, MDDB_E_NOTNOW, NODEV32, setno);
		mddb_setexit(s);
		return (err);
	}

	mddb_setexit(s);

	/* Update db records; done after the set has been exited */
	err = update_db_rec(s);
	if (err != 0)
		return (mddbstatus2error(ep, err, NODEV32, setno));

	/* Update setname embedded in the namespace */
	err = update_setname(setno);
	if (err != 0)
		return (mddbstatus2error(ep, err, NODEV32, setno));

	return (0);
}

/*
 * md_dr_add
 *
 * Appends drive record 'dr' to the chain of drive records of set record
 * 'sr'.  The chain starts at sr->sr_driverec and is linked through
 * dr_nextrec record ids, with 0 marking the end.
 */
static void
md_dr_add(
	md_set_record	*sr,
	md_drive_record	*dr
)
{
	md_drive_record	*tail;

	/* Empty chain: the new record becomes its head */
	if (sr->sr_driverec == 0) {
		sr->sr_driverec = dr->dr_selfid;
		return;
	}

	/* Walk to the last record of the chain and hook the new one in */
	tail = (md_drive_record *)mddb_getrecaddr(sr->sr_driverec);
	while (tail->dr_nextrec != 0) {
		tail = (md_drive_record *)mddb_getrecaddr(tail->dr_nextrec);
	}
	tail->dr_nextrec = dr->dr_selfid;
}

/*
 * Build a zero-terminated array of record ids for the set record sr and
 * every drive record chained off it.
 *
 * An array of "size" mddb_recid_t elements is allocated (KM_SLEEP, zero
 * filled).  Element 0 receives sr->sr_selfid; the following elements
 * receive the dr_selfid of each drive record in chain order, and a 0 is
 * stored after the last one.  The array is handed back through *ids and
 * is not freed here.
 *
 * NOTE(review): nothing below checks the fill count against "size"; the
 * caller presumably sizes the array for the set record, all drive
 * records and the terminator -- confirm against the callers.
 */
static void
md_setup_recids(
	md_set_record	*sr,
	mddb_recid_t	**ids,
	size_t		size
)
{
	mddb_recid_t	*list;
	md_drive_record	*cur;
	int		n = 0;

	list = (mddb_recid_t *)kmem_zalloc(size * sizeof (mddb_recid_t),
	    KM_SLEEP);

	/* The set record always comes first. */
	list[n++] = sr->sr_selfid;

	cur = (md_drive_record *)mddb_getrecaddr(sr->sr_driverec);
	while (cur != NULL) {
		list[n++] = cur->dr_selfid;

		/* A zero dr_nextrec marks the end of the chain. */
		if (cur->dr_nextrec == 0)
			break;
		cur = (md_drive_record *)mddb_getrecaddr(cur->dr_nextrec);
	}

	list[n] = 0;
	*ids = list;
}

/*
 * Replace the old device id with the new device id in the namespace of
 * the set named by cp->c_setno.  This is used for importing remotely
 * replicated drives.
 *
 * The old and new device ids are taken from cp->c_locator.l_old_devid
 * and cp->c_locator.l_devid.  Every key returned by md_getnextkey() is
 * examined; when the shared devid entry it refers to compares equal to
 * the old device id, that shared entry is removed and a new one holding
 * the new device id is allocated with the same key and count, its data
 * word additionally marked NM_DEVID_VALID, and then committed.
 *
 * Returns:
 *	0	done, or the set has no NM_DEVID | NM_NOTSHARED record
 *	EIO	MD_SET_NM_LOADED is not set for the set
 *	ENOENT	the shared devid record or a shared entry was not found
 *	ENOMEM	a replacement shared entry could not be allocated
 */
int
md_update_namespace_rr_did(
	mddb_config_t	*cp
)
{
	set_t			setno = cp->c_setno;
	mdkey_t			key = MD_KEYWILD;
	side_t			side = MD_SIDEWILD;
	void			*old_devid, *new_devid;
	struct nm_next_hdr	*min_nh;
	struct nm_next_hdr	*shr_nh;
	struct did_min_name	*min_ent;
	struct did_shr_name	*old_ent;
	struct did_shr_name	*new_ent;
	mddb_recid_t		recids[3];
	mdkey_t			saved_key;
	uint32_t		saved_count;
	uint32_t		saved_data;

	if ((md_get_setstatus(setno) & MD_SET_NM_LOADED) == 0)
		return (EIO);

	old_devid = (void *)(uintptr_t)cp->c_locator.l_old_devid;
	new_devid = (void *)(uintptr_t)cp->c_locator.l_devid;

	/* Having no configuration at all is not an error. */
	min_nh = get_first_record(setno, 0, NM_DEVID | NM_NOTSHARED);
	if (min_nh == NULL)
		return (0);

	for (;;) {
		key = md_getnextkey(setno, side, key, NULL);
		if (key == MD_KEYWILD)
			break;

		/* Keys for which lookup_entry() finds nothing are skipped. */
		min_ent = (struct did_min_name *)lookup_entry(min_nh, setno,
		    side, key, NODEV64, NM_DEVID);
		if (min_ent == NULL)
			continue;

		shr_nh = get_first_record(setno, 0, NM_DEVID | NM_SHARED);
		if (shr_nh == NULL)
			return (ENOENT);

		old_ent = (struct did_shr_name *)lookup_shared_entry(shr_nh,
		    min_ent->min_devid_key, (char *)0, &recids[0], NM_DEVID);
		if (old_ent == NULL)
			return (ENOENT);

		rw_enter(&nm_lock.lock, RW_WRITER);

		/* Only entries that carry the old device id are rewritten. */
		if (ddi_devid_compare((ddi_devid_t)old_ent->did_devid,
		    old_devid) != 0) {
			rw_exit(&nm_lock.lock);
			continue;
		}

		/*
		 * Remember the fields that must survive, then drop the
		 * entry that holds the old device id.
		 */
		saved_key = old_ent->did_key;
		saved_count = old_ent->did_count;
		saved_data = old_ent->did_data;
		(void) remove_shared_entry(shr_nh, old_ent->did_key, NULL,
		    NM_DEVID | NM_IMP_SHARED | NM_KEY_RECYCLE);

		/* Allocate the replacement entry for the new device id. */
		new_ent = (struct did_shr_name *)alloc_entry(shr_nh,
		    md_set[setno].s_did_nmid, cp->c_locator.l_devid_sz,
		    NM_DEVID | NM_SHARED | NM_NOCOMMIT, &recids[0]);
		if (new_ent == NULL) {
			rw_exit(&nm_lock.lock);
			return (ENOMEM);
		}

		new_ent->did_key = saved_key;
		new_ent->did_count = saved_count;
		saved_data |= NM_DEVID_VALID;
		new_ent->did_data = saved_data;
		new_ent->did_size = ddi_devid_sizeof(new_devid);
		bcopy((void *)new_devid, (void *)new_ent->did_devid,
		    new_ent->did_size);

		/* Commit the record alloc_entry() reported plus s_nmid. */
		recids[1] = md_set[setno].s_nmid;
		recids[2] = 0;
		mddb_commitrecs_wrapper(recids);

		rw_exit(&nm_lock.lock);
	}

	return (0);
}

/*
 * Wrapper around md_update_namespace_rr_did(); the namespace is loaded
 * before this is called.
 *
 * md_update_namespace_rr_did() may be called twice when a replicated
 * device id is being resolved during the take of a diskset: once for
 * the diskset namespace and once more for the local namespace.  The
 * local namespace needs updating when a drive found during the take had
 * not been resolved during the import (a partial replicated import).
 *
 * When called during the import of the diskset (MDDB_C_IMPORT set in
 * cp->c_flags) only the diskset namespace is updated.
 *
 * Note that for the second call cp->c_setno is overwritten with
 * MD_LOCAL_SET and is left that way on return.
 *
 * Returns 0 on success, otherwise the error from whichever
 * md_update_namespace_rr_did() call failed.
 */
int
md_update_nm_rr_did_ioctl(
	mddb_config_t	*cp
)
{
	int	err;

	/*
	 * Diskset namespace first.  Stop here on failure, and also when
	 * importing, since an import never touches the local namespace.
	 */
	err = md_update_namespace_rr_did(cp);
	if (err != 0 || (cp->c_flags & MDDB_C_IMPORT))
		return (err);

	/* Now the local namespace; its result is the final result. */
	cp->c_setno = MD_LOCAL_SET;
	return (md_update_namespace_rr_did(cp));
}

/*
 * Snarf a diskset that is being imported: fix up its NM records, load
 * its namespaces, let each module on md_opslist fix up its records via
 * md_imp_set, and push the database changes out through md_imp_db().
 *
 * cp supplies the set number (c_setno), the flags that are handed to
 * md_imp_db() as its stale flag (c_flags) and the error structure that
 * is filled in on failure (c_mde).
 *
 * On every path except the replicated-import return, the set is halted
 * and unloaded again and MD_SET_IMPORT / MD_SET_REPLICATED_IMPORT are
 * cleared before returning; this includes the success path, which falls
 * through into the cleanup label.
 *
 * Returns 0 on success, otherwise the value produced by mdsyserror(),
 * mddbstatus2error() or md_imp_db() for the step that failed.
 */
/*ARGSUSED*/
int
md_imp_snarf_set(
	mddb_config_t	*cp
)
{
	set_t		setno;
	int		stale_flag;
	mddb_set_t	*s;
	int		i, err = 0;
	md_ops_t	*ops;
	md_error_t	*ep = &cp->c_mde;

	setno = cp->c_setno;
	stale_flag = cp->c_flags;

	mdclrerror(ep);
	/* Reject set numbers outside the configured range. */
	if (setno >= md_nsets) {
		return (mdsyserror(ep, EINVAL));
	}

	md_haltsnarf_enter(setno);
	/*
	 * MD_SET_IMPORT is already set: do no work.  err is still 0 here,
	 * and the out label clears MD_SET_IMPORT and
	 * MD_SET_REPLICATED_IMPORT before returning.
	 * NOTE(review): confirm that clearing the flags on this path is
	 * intended.
	 */
	if (md_get_setstatus(setno) & MD_SET_IMPORT) {
		goto out;
	}

	/* Set the bit first otherwise load_old_replicas can fail */
	md_set_setstatus(setno, MD_SET_IMPORT);

	if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL) {
		err = mddbstatus2error(ep, err, NODEV32, setno);
		goto out;
	}

	/*
	 * Upon completion of load_old_replicas, the old setno is
	 * restored from the disk so we need to reset
	 */
	s->s_lbp->lb_setno = setno;

	/*
	 * Fixup the NM records before loading namespace
	 * (the return value of md_imp_nm() is deliberately ignored)
	 */
	(void) md_imp_nm(s);
	mddb_setexit(s);

	/*
	 * Load the devid name space if it exists
	 * and ask each module to fixup unit records
	 */
	if (!md_load_namespace(setno, NULL, NM_DEVID)) {
		err = mdsyserror(ep, ENOENT);
		goto cleanup;
	}
	/*
	 * Second load, this time with flag 0L.  On failure the NM_DEVID
	 * namespace loaded above is unloaded again before bailing out.
	 */
	if (!md_load_namespace(setno, NULL, 0L)) {
		(void) md_unload_namespace(setno, NM_DEVID);
		err = mdsyserror(ep, ENOENT);
		goto cleanup;
	}

	/*
	 * Call every module's md_imp_set and sum the return values; keep
	 * making full passes over md_opslist until a pass sums to zero.
	 */
	do {
		i = 0;
		for (ops = md_opslist; ops != NULL; ops = ops->md_next)
			if (ops->md_imp_set != NULL)
				i += ops->md_imp_set(setno);
	} while (i);

	/*
	 * Fixup
	 *	(1) locator block
	 *	(2) locator name block if necessary
	 *	(3) master block
	 *	(4) directory block
	 * calls appropriate writes to push changes out
	 */
	if ((err = md_imp_db(setno, stale_flag, ep)) != 0) {
		goto cleanup;
	}

	/*
	 * Don't unload namespace if importing a replicated diskset.
	 * Namespace will be unloaded with an explicit RELEASE_SET ioctl.
	 *
	 * This return skips both labels below, so the set is not halted
	 * or unloaded and the import status bits stay set.
	 *
	 * NOTE(review): s is dereferenced here after mddb_setexit(s)
	 * above; presumably s->s_setno is still valid and equal to
	 * setno -- confirm.
	 */
	if (md_get_setstatus(s->s_setno) & MD_SET_REPLICATED_IMPORT) {
		md_haltsnarf_exit(setno);
		return (err);
	}

	/* Reached by fall-through on success as well as by goto on error. */
cleanup:
	/*
	 * Halt the set
	 */
	rw_enter(&md_unit_array_rw.lock, RW_WRITER);
	(void) md_halt_set(setno, MD_HALT_ALL);
	rw_exit(&md_unit_array_rw.lock);

	/*
	 * Unload the namespace for the imported set
	 */
	mutex_enter(&mddb_lock);
	mddb_unload_set(setno);
	mutex_exit(&mddb_lock);

out:
	/*
	 * Leave the halt/snarf section first, then clear both import
	 * status bits for the set.
	 */
	md_haltsnarf_exit(setno);
	md_clr_setstatus(setno, MD_SET_IMPORT | MD_SET_REPLICATED_IMPORT);
	return (err);
}
#endif	/* MDDB_FAKE */