/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/types.h>
#include <sys/conf.h>
#include <sys/ddi.h>
#include <sys/stat.h>
#include <sys/sunddi.h>
#include <sys/ddi_impldefs.h>
#include <sys/obpdefs.h>
#include <sys/cmn_err.h>
#include <sys/errno.h>
#include <sys/kmem.h>
#include <sys/open.h>
#include <sys/thread.h>
#include <sys/cpuvar.h>
#include <sys/x_call.h>
#include <sys/debug.h>
#include <sys/sysmacros.h>
#include <sys/ivintr.h>
#include <sys/intr.h>
#include <sys/intreg.h>
#include <sys/autoconf.h>
#include <sys/modctl.h>
#include <sys/spl.h>
#include <sys/async.h>
#include <sys/mc.h>
#include <sys/mc-us3.h>
#include <sys/cpu_module.h>
#include <sys/platform_module.h>

/*
 * Function prototypes
 */

static int mc_open(dev_t *, int, int, cred_t *);
static int mc_close(dev_t, int, int, cred_t *);
static int mc_ioctl(dev_t, int, intptr_t, int, cred_t *, int *);
static int mc_attach(dev_info_t *, ddi_attach_cmd_t);
static int mc_detach(dev_info_t *, ddi_detach_cmd_t);

/*
 * Configuration data structures
 */
static struct cb_ops mc_cb_ops = {
	mc_open,			/* open */
	mc_close,			/* close */
	nulldev,			/* strategy */
	nulldev,			/* print */
	nodev,				/* dump */
	nulldev,			/* read */
	nulldev,			/* write */
	mc_ioctl,			/* ioctl */
	nodev,				/* devmap */
	nodev,				/* mmap */
	nodev,				/* segmap */
	nochpoll,			/* poll */
	ddi_prop_op,			/* cb_prop_op */
	0,				/* streamtab */
	D_MP | D_NEW | D_HOTPLUG,	/* Driver compatibility flag */
	CB_REV,				/* rev */
	nodev,				/* cb_aread */
	nodev				/* cb_awrite */
};

static struct dev_ops mc_ops = {
	DEVO_REV,			/* rev */
	0,				/* refcnt  */
	ddi_getinfo_1to1,		/* getinfo */
	nulldev,			/* identify */
	nulldev,			/* probe */
	mc_attach,			/* attach */
	mc_detach,			/* detach */
	nulldev,			/* reset */
	&mc_cb_ops,			/* cb_ops */
	(struct bus_ops *)0,		/* bus_ops */
	nulldev,			/* power */
	ddi_quiesce_not_needed,			/* quiesce */
};

/*
 * Driver globals
 */
static void *mcp;
static int nmcs = 0;
static int seg_id = 0;
static int nsegments = 0;
static uint64_t memsize = 0;
static int maxbanks = 0;

static mc_dlist_t *seg_head, *seg_tail, *bank_head, *bank_tail;
static mc_dlist_t *mctrl_head, *mctrl_tail, *dgrp_head, *dgrp_tail;
static mc_dlist_t *device_head, *device_tail;

static kmutex_t	mcmutex;
static kmutex_t	mcdatamutex;

static krwlock_t mcdimmsids_rw;

/* pointer to cache of DIMM serial ids */
static dimm_sid_cache_t	*mc_dimm_sids;
static int		max_entries;

extern struct mod_ops mod_driverops;

static struct modldrv modldrv = {
	&mod_driverops,			/* module type, this one is a driver */
	"Memory-controller",		/* module name */
	&mc_ops,			/* driver ops */
};

static struct modlinkage modlinkage = {
	MODREV_1,		/* rev */
	(void *)&modldrv,
	NULL
};

static int mc_get_mem_unum(int synd_code, uint64_t paddr, char *buf,
    int buflen, int *lenp);
static int mc_get_mem_info(int synd_code, uint64_t paddr,
    uint64_t *mem_sizep, uint64_t *seg_sizep, uint64_t *bank_sizep,
    int *segsp, int *banksp, int *mcidp);
static int mc_get_mem_sid(int mcid, int dimm, char *buf, int buflen, int *lenp);
static int mc_get_mem_offset(uint64_t paddr, uint64_t *offp);
static int mc_get_mem_addr(int mcid, char *sid, uint64_t off, uint64_t *paddr);
static int mc_init_sid_cache(void);
static int mc_get_mcregs(struct mc_soft_state *);
static void mc_construct(int mc_id, void *dimminfop);
static int mlayout_add(int mc_id, int bank_no, uint64_t reg, void *dimminfop);
static void mlayout_del(int mc_id, int delete);
static struct seg_info *seg_match_base(u_longlong_t base);
static void mc_node_add(mc_dlist_t *node, mc_dlist_t **head, mc_dlist_t **tail);
static void mc_node_del(mc_dlist_t *node, mc_dlist_t **head, mc_dlist_t **tail);
static mc_dlist_t *mc_node_get(int id, mc_dlist_t *head);
static void mc_add_mem_unum_label(char *buf, int mcid, int bank, int dimm);
static int mc_populate_sid_cache(void);
static int mc_get_sid_cache_index(int mcid);
static void mc_update_bank(struct bank_info *bank);

#pragma weak p2get_mem_unum
#pragma weak p2get_mem_info
#pragma weak p2get_mem_sid
#pragma weak p2get_mem_offset
#pragma weak p2get_mem_addr
#pragma weak p2init_sid_cache
#pragma weak plat_add_mem_unum_label
#pragma weak plat_alloc_sid_cache
#pragma weak plat_populate_sid_cache

#define	QWORD_SIZE		144
#define	QWORD_SIZE_BYTES	(QWORD_SIZE / 8)
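
/*
 * A 144-bit checkword is made up of 128 data bits plus 9 ECC bits, 3 Mtag
 * bits, and 4 Mtag ECC bits; see the cache line layout comment above
 * qwordmap[] below.
 */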

/*
 * These are the module initialization routines.
 */

int
_init(void)
{
	int error;

	if ((error = ddi_soft_state_init(&mcp,
	    sizeof (struct mc_soft_state), 1)) != 0)
		return (error);

	error = mod_install(&modlinkage);
	if (error == 0) {
		mutex_init(&mcmutex, NULL, MUTEX_DRIVER, NULL);
		mutex_init(&mcdatamutex, NULL, MUTEX_DRIVER, NULL);
		rw_init(&mcdimmsids_rw, NULL, RW_DRIVER, NULL);
	}

	return (error);
}

int
_fini(void)
{
	int error;

	if ((error = mod_remove(&modlinkage)) != 0)
		return (error);

	ddi_soft_state_fini(&mcp);
	mutex_destroy(&mcmutex);
	mutex_destroy(&mcdatamutex);
	rw_destroy(&mcdimmsids_rw);

	if (mc_dimm_sids)
		kmem_free(mc_dimm_sids, sizeof (dimm_sid_cache_t) *
		    max_entries);

	return (0);
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}

static int
mc_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
{
	struct mc_soft_state *softsp;
	struct dimm_info *dimminfop;
	int instance, len, err;

	/* get the instance of this devi */
	instance = ddi_get_instance(devi);

	switch (cmd) {
	case DDI_ATTACH:
		break;

	case DDI_RESUME:
		/* get the soft state pointer for this device node */
		softsp = ddi_get_soft_state(mcp, instance);
		DPRINTF(MC_ATTACH_DEBUG, ("mc%d: DDI_RESUME: updating MADRs\n",
		    instance));
		/*
		 * During resume, the source and target board's bank_infos
		 * need to be updated with the new mc MADR values.  This is
		 * implemented with existing functionality by first removing
		 * the props and allocated data structs, and then adding them
		 * back in.
		 */
		if (ddi_prop_exists(DDI_DEV_T_ANY, softsp->dip,
		    DDI_PROP_NOTPROM | DDI_PROP_DONTPASS,
		    MEM_CFG_PROP_NAME) == 1) {
			(void) ddi_prop_remove(DDI_DEV_T_NONE, softsp->dip,
			    MEM_CFG_PROP_NAME);
		}
		mlayout_del(softsp->portid, 0);
		if (mc_get_mcregs(softsp) == -1) {
			cmn_err(CE_WARN, "mc_attach: mc%d DDI_RESUME failure\n",
			    instance);
		}
		return (DDI_SUCCESS);

	default:
		return (DDI_FAILURE);
	}

	if (ddi_soft_state_zalloc(mcp, instance) != DDI_SUCCESS)
		return (DDI_FAILURE);

	softsp = ddi_get_soft_state(mcp, instance);

	/* Set the dip in the soft state */
	softsp->dip = devi;

	if ((softsp->portid = (int)ddi_getprop(DDI_DEV_T_ANY, softsp->dip,
	    DDI_PROP_DONTPASS, "portid", -1)) == -1) {
		DPRINTF(MC_ATTACH_DEBUG, ("mc%d: unable to get %s property",
		    instance, "portid"));
		goto bad;
	}

	DPRINTF(MC_ATTACH_DEBUG, ("mc%d ATTACH: portid %d, cpuid %d\n",
	    instance, softsp->portid, CPU->cpu_id));

	/* map in the registers for this device. */
	if (ddi_map_regs(softsp->dip, 0, (caddr_t *)&softsp->mc_base, 0, 0)) {
		DPRINTF(MC_ATTACH_DEBUG, ("mc%d: unable to map registers",
		    instance));
		goto bad;
	}

	/*
	 * Get the DIMM labels and pin routing information from the
	 * memory-layout property if the memory controller is enabled.
	 *
	 * Basically every memory-controller node on every machine should
	 * have one of these properties unless the memory controller is
	 * physically not capable of having memory attached to it, e.g.
	 * Excalibur's slave processor.
	 */
	err = ddi_getlongprop(DDI_DEV_T_ANY, softsp->dip, DDI_PROP_DONTPASS,
	    "memory-layout", (caddr_t)&dimminfop, &len);
	if (err == DDI_PROP_SUCCESS) {
		/*
		 * Set the pointer and size of property in the soft state
		 */
		softsp->memlayoutp = dimminfop;
		softsp->size = len;
	} else if (err == DDI_PROP_NOT_FOUND) {
		/*
		 * This is a disabled MC. Clear out the pointer and size
		 * of the property in the soft state.
		 */
		softsp->memlayoutp = NULL;
		softsp->size = 0;
	} else {
		DPRINTF(MC_ATTACH_DEBUG, ("mc%d is disabled: dimminfop %p\n",
		    instance, (void *)dimminfop));
		goto bad2;
	}

	DPRINTF(MC_ATTACH_DEBUG, ("mc%d: dimminfop=0x%p data=0x%lx len=%d\n",
	    instance, (void *)dimminfop, *(uint64_t *)dimminfop, len));

	/* Get MC registers and construct all needed data structure */
	if (mc_get_mcregs(softsp) == -1)
		goto bad1;

	mutex_enter(&mcmutex);
	if (nmcs == 1) {
		if (&p2get_mem_unum)
			p2get_mem_unum = mc_get_mem_unum;
		if (&p2get_mem_info)
			p2get_mem_info = mc_get_mem_info;
		if (&p2get_mem_sid)
			p2get_mem_sid = mc_get_mem_sid;
		if (&p2get_mem_offset)
			p2get_mem_offset = mc_get_mem_offset;
		if (&p2get_mem_addr)
			p2get_mem_addr = mc_get_mem_addr;
		if (&p2init_sid_cache)
			p2init_sid_cache = mc_init_sid_cache;
	}

	mutex_exit(&mcmutex);

	/*
	 * Update DIMM serial id information if the DIMM serial id
	 * cache has already been initialized.
	 */
	if (mc_dimm_sids) {
		rw_enter(&mcdimmsids_rw, RW_WRITER);
		(void) mc_populate_sid_cache();
		rw_exit(&mcdimmsids_rw);
	}

	if (ddi_create_minor_node(devi, "mc-us3", S_IFCHR, instance,
	    "ddi_mem_ctrl", 0) != DDI_SUCCESS) {
		DPRINTF(MC_ATTACH_DEBUG, ("mc_attach: create_minor_node"
		    " failed \n"));
		goto bad1;
	}

	ddi_report_dev(devi);
	return (DDI_SUCCESS);

bad1:
	/* release all allocated data structures for this MC */
	mlayout_del(softsp->portid, 0);
	if (softsp->memlayoutp != NULL)
		kmem_free(softsp->memlayoutp, softsp->size);

	/* remove the libdevinfo property */
	if (ddi_prop_exists(DDI_DEV_T_ANY, softsp->dip,
	    DDI_PROP_NOTPROM | DDI_PROP_DONTPASS,
	    MEM_CFG_PROP_NAME) == 1) {
		(void) ddi_prop_remove(DDI_DEV_T_NONE, softsp->dip,
		    MEM_CFG_PROP_NAME);
	}

bad2:
	/* unmap the registers for this device. */
	ddi_unmap_regs(softsp->dip, 0, (caddr_t *)&softsp->mc_base, 0, 0);

bad:
	ddi_soft_state_free(mcp, instance);
	return (DDI_FAILURE);
}

/* ARGSUSED */
static int
mc_detach(dev_info_t *devi, ddi_detach_cmd_t cmd)
{
	int instance;
	struct mc_soft_state *softsp;

	/* get the instance of this devi */
	instance = ddi_get_instance(devi);

	/* get the soft state pointer for this device node */
	softsp = ddi_get_soft_state(mcp, instance);

	switch (cmd) {
	case DDI_SUSPEND:
		return (DDI_SUCCESS);

	case DDI_DETACH:
		break;

	default:
		return (DDI_FAILURE);
	}

	DPRINTF(MC_DETACH_DEBUG, ("mc%d DETACH: portid= %d, table 0x%p\n",
	    instance, softsp->portid, softsp->memlayoutp));

	/* remove the libdevinfo property */
	if (ddi_prop_exists(DDI_DEV_T_ANY, softsp->dip,
	    DDI_PROP_NOTPROM | DDI_PROP_DONTPASS,
	    MEM_CFG_PROP_NAME) == 1) {
		(void) ddi_prop_remove(DDI_DEV_T_NONE, softsp->dip,
		    MEM_CFG_PROP_NAME);
	}

	/* release all allocated data structures for this MC */
	mlayout_del(softsp->portid, 1);
	if (softsp->memlayoutp != NULL)
		kmem_free(softsp->memlayoutp, softsp->size);

	/* unmap the registers */
	ddi_unmap_regs(softsp->dip, 0, (caddr_t *)&softsp->mc_base, 0, 0);

	mutex_enter(&mcmutex);
	if (nmcs == 0) {
		if (&p2get_mem_unum)
			p2get_mem_unum = NULL;
		if (&p2get_mem_info)
			p2get_mem_info = NULL;
		if (&p2get_mem_sid)
			p2get_mem_sid = NULL;
		if (&p2get_mem_offset)
			p2get_mem_offset = NULL;
		if (&p2get_mem_addr)
			p2get_mem_addr = NULL;
		if (&p2init_sid_cache)
			p2init_sid_cache = NULL;
	}

	mutex_exit(&mcmutex);

	ddi_remove_minor_node(devi, NULL);

	/* free up the soft state */
	ddi_soft_state_free(mcp, instance);

	return (DDI_SUCCESS);
}

/* ARGSUSED */
static int
mc_open(dev_t *devp, int flag, int otyp, cred_t *credp)
{

	/* verify that otyp is appropriate */
	if (otyp != OTYP_CHR) {
		return (EINVAL);
	}

	return (0);
}

/* ARGSUSED */
static int
mc_close(dev_t devp, int flag, int otyp, cred_t *credp)
{
	return (0);
}

/*
 * cmd includes MCIOC_MEMCONF, MCIOC_MEM, MCIOC_SEG, MCIOC_BANK, MCIOC_DEVGRP,
 * MCIOC_CTRLCONF, MCIOC_CONTROL.
 *
 * MCIOC_MEM, MCIOC_SEG, MCIOC_CTRLCONF, and MCIOC_CONTROL are associated
 * with variable-length structures. If the count supplied by the caller is
 * less than the count in the kernel, the count in the structure is updated
 * and EINVAL is returned so that the caller can allocate enough space.
 */
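
/*
 * Illustrative sketch (not part of the driver): a userland consumer of
 * MCIOC_MEM would typically issue the ioctl twice, first with a small
 * nsegments to learn the required count via EINVAL, then with a buffer
 * sized for that count. fd is assumed to be an open descriptor for this
 * driver's minor node; error handling is omitted.
 *
 *	struct mc_memory m, *mp;
 *	m.nsegments = 0;
 *	if (ioctl(fd, MCIOC_MEM, &m) == -1 && errno == EINVAL) {
 *		size_t sz = sizeof (*mp) + (m.nsegments - 1) *
 *		    sizeof (mp->segmentids[0]);
 *		mp = malloc(sz);
 *		mp->nsegments = m.nsegments;
 *		(void) ioctl(fd, MCIOC_MEM, mp);
 *	}
 */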

/* ARGSUSED */
static int
mc_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cred_p,
	int *rval_p)
{
	size_t	size;
	struct mc_memconf mcmconf;
	struct mc_memory *mcmem, mcmem_in;
	struct mc_segment *mcseg, mcseg_in;
	struct mc_bank mcbank;
	struct mc_devgrp mcdevgrp;
	struct mc_ctrlconf *mcctrlconf, mcctrlconf_in;
	struct mc_control *mccontrol, mccontrol_in;
	struct seg_info *seg = NULL;
	struct bank_info *bank = NULL;
	struct dgrp_info *dgrp = NULL;
	struct mctrl_info *mcport;
	mc_dlist_t *mctrl;
	int i, status = 0;
	cpu_t *cpu;

	switch (cmd) {
	case MCIOC_MEMCONF:
		mutex_enter(&mcdatamutex);

		mcmconf.nmcs = nmcs;
		mcmconf.nsegments = nsegments;
		mcmconf.nbanks = maxbanks;
		mcmconf.ndevgrps = NDGRPS;
		mcmconf.ndevs = NDIMMS;
		mcmconf.len_dev = MAX_DEVLEN;
		mcmconf.xfer_size = TRANSFER_SIZE;

		mutex_exit(&mcdatamutex);

		if (copyout(&mcmconf, (void *)arg, sizeof (struct mc_memconf)))
			return (EFAULT);
		return (0);

	/*
	 * input: nsegments and allocated space for the variable-length
	 *	array of segment ids
	 *
	 * return    0: size, number of segments, and all segment ids,
	 *		where global and local ids are identical.
	 *	EINVAL: if the given nsegments is less than the kernel's;
	 *		nsegments of the struct will be updated.
	 *	EFAULT: on other errors in the kernel.
	 */
	case MCIOC_MEM:
		if (copyin((void *)arg, &mcmem_in,
		    sizeof (struct mc_memory)) != 0)
			return (EFAULT);

		mutex_enter(&mcdatamutex);
		if (mcmem_in.nsegments < nsegments) {
			mcmem_in.nsegments = nsegments;
			if (copyout(&mcmem_in, (void *)arg,
			    sizeof (struct mc_memory)))
				status = EFAULT;
			else
				status = EINVAL;

			mutex_exit(&mcdatamutex);
			return (status);
		}

		size = sizeof (struct mc_memory) + (nsegments - 1) *
		    sizeof (mcmem->segmentids[0]);
		mcmem = kmem_zalloc(size, KM_SLEEP);

		mcmem->size = memsize;
		mcmem->nsegments = nsegments;
		seg = (struct seg_info *)seg_head;
		for (i = 0; i < nsegments; i++) {
			ASSERT(seg != NULL);
			mcmem->segmentids[i].globalid = seg->seg_node.id;
			mcmem->segmentids[i].localid = seg->seg_node.id;
			seg = (struct seg_info *)seg->seg_node.next;
		}
		mutex_exit(&mcdatamutex);

		if (copyout(mcmem, (void *)arg, size))
			status = EFAULT;

		kmem_free(mcmem, size);
		return (status);

	/*
	 * input: id, nbanks, and allocated space for the variable-length
	 *	array of bank ids
	 *
	 * return    0: base, size, number of banks, and all bank ids,
	 *		where the global id is unique across all banks and
	 *		the local id is unique only within the mc.
	 *	EINVAL: either the id isn't found, or the given nbanks is
	 *		less than the kernel's; nbanks of the struct will
	 *		be updated.
	 *	EFAULT: on other errors in the kernel.
	 */
	case MCIOC_SEG:

		if (copyin((void *)arg, &mcseg_in,
		    sizeof (struct mc_segment)) != 0)
			return (EFAULT);

		mutex_enter(&mcdatamutex);
		if ((seg = (struct seg_info *)mc_node_get(mcseg_in.id,
		    seg_head)) == NULL) {
			DPRINTF(MC_CMD_DEBUG, ("MCIOC_SEG: seg not match, "
			    "id %d\n", mcseg_in.id));
			mutex_exit(&mcdatamutex);
			return (EFAULT);
		}

		if (mcseg_in.nbanks < seg->nbanks) {
			mcseg_in.nbanks = seg->nbanks;
			if (copyout(&mcseg_in, (void *)arg,
			    sizeof (struct mc_segment)))
				status = EFAULT;
			else
				status = EINVAL;

			mutex_exit(&mcdatamutex);
			return (status);
		}

		size = sizeof (struct mc_segment) + (seg->nbanks - 1) *
		    sizeof (mcseg->bankids[0]);
		mcseg = kmem_zalloc(size, KM_SLEEP);

		mcseg->id = seg->seg_node.id;
		mcseg->ifactor = seg->ifactor;
		mcseg->base = seg->base;
		mcseg->size = seg->size;
		mcseg->nbanks = seg->nbanks;

		bank = seg->hb_inseg;

		DPRINTF(MC_CMD_DEBUG, ("MCIOC_SEG:nbanks %d seg 0x%p bank %p\n",
		    seg->nbanks, (void *)seg, (void *)bank));

		i = 0;
		while (bank != NULL) {
			DPRINTF(MC_CMD_DEBUG, ("MCIOC_SEG:idx %d bank_id %d\n",
			    i, bank->bank_node.id));
			mcseg->bankids[i].globalid = bank->bank_node.id;
			mcseg->bankids[i++].localid =
			    bank->local_id;
			bank = bank->n_inseg;
		}
		ASSERT(i == seg->nbanks);
		mutex_exit(&mcdatamutex);

		if (copyout(mcseg, (void *)arg, size))
			status = EFAULT;

		kmem_free(mcseg, size);
		return (status);

	/*
	 * input: id
	 *
	 * return    0: mask, match, size, and devgrpid,
	 *		where the global id is unique across all devgrps and
	 *		the local id is unique only within the mc.
	 *	EINVAL: if the id isn't found
	 *	EFAULT: on other errors in the kernel.
	 */
	case MCIOC_BANK:
		if (copyin((void *)arg, &mcbank, sizeof (struct mc_bank)) != 0)
			return (EFAULT);

		DPRINTF(MC_CMD_DEBUG, ("MCIOC_BANK: bank id %d\n", mcbank.id));

		mutex_enter(&mcdatamutex);

		if ((bank = (struct bank_info *)mc_node_get(mcbank.id,
		    bank_head)) == NULL) {
			mutex_exit(&mcdatamutex);
			return (EINVAL);
		}

		DPRINTF(MC_CMD_DEBUG, ("MCIOC_BANK: bank %d (0x%p) valid %hu\n",
		    bank->bank_node.id, (void *)bank, bank->valid));

		/*
		 * If (physical address & MASK) == MATCH, the physical address
		 * is located in this bank. The lower physical address bits
		 * are at [9-6].
		 */
		mcbank.mask = (~(bank->lk | ~(MADR_LK_MASK >>
		    MADR_LK_SHIFT))) << MADR_LPA_SHIFT;
		mcbank.match = bank->lm << MADR_LPA_SHIFT;
		mcbank.size = bank->size;
		mcbank.devgrpid.globalid = bank->devgrp_id;
		mcbank.devgrpid.localid = bank->devgrp_id % NDGRPS;
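
		/*
		 * Illustrative example (values assumed): with lk = 0xe,
		 * i.e. 2-way interleave, and a 4-bit lk field, the mask
		 * above reduces to (~(0xe | ~0xf)) << MADR_LPA_SHIFT, i.e.
		 * 0x1 shifted up to PA bit 6 per the [9-6] note above, so a
		 * physical address matches this bank when its bit 6 equals
		 * bit 0 of lm.
		 */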

		mutex_exit(&mcdatamutex);

		if (copyout(&mcbank, (void *)arg, sizeof (struct mc_bank)))
			return (EFAULT);
		return (0);

	/*
	 * input: id and allocated space for the variable-length array of
	 *	device ids
	 *
	 * return    0: size and number of devices.
	 *	EINVAL: if the id isn't found
	 *	EFAULT: on other errors in the kernel.
	 */
	case MCIOC_DEVGRP:

		if (copyin((void *)arg, &mcdevgrp,
		    sizeof (struct mc_devgrp)) != 0)
			return (EFAULT);

		mutex_enter(&mcdatamutex);
		if ((dgrp = (struct dgrp_info *)mc_node_get(mcdevgrp.id,
		    dgrp_head)) == NULL) {
			DPRINTF(MC_CMD_DEBUG, ("MCIOC_DEVGRP: not match, id "
			    "%d\n", mcdevgrp.id));
			mutex_exit(&mcdatamutex);
			return (EINVAL);
		}

		mcdevgrp.ndevices = dgrp->ndevices;
		mcdevgrp.size = dgrp->size;

		mutex_exit(&mcdatamutex);

		if (copyout(&mcdevgrp, (void *)arg, sizeof (struct mc_devgrp)))
			status = EFAULT;

		return (status);

	/*
	 * input: nmcs and allocated space for the variable-length array
	 *	of mc ids
	 *
	 * return    0: number of mcs, and all mc ids,
	 *		where global and local ids are identical.
	 *	EINVAL: if the given nmcs is less than the kernel's;
	 *		nmcs of the struct will be updated.
	 *	EFAULT: on other errors in the kernel.
	 */
	case MCIOC_CTRLCONF:
		if (copyin((void *)arg, &mcctrlconf_in,
		    sizeof (struct mc_ctrlconf)) != 0)
			return (EFAULT);

		mutex_enter(&mcdatamutex);
		if (mcctrlconf_in.nmcs < nmcs) {
			mcctrlconf_in.nmcs = nmcs;
			if (copyout(&mcctrlconf_in, (void *)arg,
			    sizeof (struct mc_ctrlconf)))
				status = EFAULT;
			else
				status = EINVAL;

			mutex_exit(&mcdatamutex);
			return (status);
		}

		/*
		 * Cannot just use the size of the struct because of the
		 * variable-length array of mc ids at its end.
		 */
		size = sizeof (struct mc_ctrlconf) + ((nmcs - 1) *
		    sizeof (mcctrlconf->mcids[0]));
		mcctrlconf = kmem_zalloc(size, KM_SLEEP);

		mcctrlconf->nmcs = nmcs;

		/* Get all MC ids and add to mcctrlconf */
		mctrl = mctrl_head;
		i = 0;
		while (mctrl != NULL) {
			mcctrlconf->mcids[i].globalid = mctrl->id;
			mcctrlconf->mcids[i].localid = mctrl->id;
			i++;
			mctrl = mctrl->next;
		}
		ASSERT(i == nmcs);

		mutex_exit(&mcdatamutex);

		if (copyout(mcctrlconf, (void *)arg, size))
			status = EFAULT;

		kmem_free(mcctrlconf, size);
		return (status);

	/*
	 * input: id, ndevgrps, and allocated space for the variable-length
	 *	array of devgrp ids
	 *
	 * return    0: number of devgrps, and all devgrp ids,
	 *		where the global id is unique across all devgrps and
	 *		the local id is unique only within the mc.
	 *	EINVAL: either the id isn't found, or the given ndevgrps is
	 *		less than the kernel's; ndevgrps of the struct will
	 *		be updated.
	 *	EFAULT: on other errors in the kernel.
	 */
	case MCIOC_CONTROL:
		if (copyin((void *)arg, &mccontrol_in,
		    sizeof (struct mc_control)) != 0)
			return (EFAULT);

		mutex_enter(&mcdatamutex);
		if ((mcport = (struct mctrl_info *)mc_node_get(mccontrol_in.id,
		    mctrl_head)) == NULL) {
			mutex_exit(&mcdatamutex);
			return (EINVAL);
		}

		/*
		 * mcport->ndevgrps of zero means the Memory Controller is
		 * disabled.
		 */
		if ((mccontrol_in.ndevgrps < mcport->ndevgrps) ||
		    (mcport->ndevgrps == 0)) {
			mccontrol_in.ndevgrps = mcport->ndevgrps;
			if (copyout(&mccontrol_in, (void *)arg,
			    sizeof (struct mc_control)))
				status = EFAULT;
			else if (mcport->ndevgrps != 0)
				status = EINVAL;

			mutex_exit(&mcdatamutex);
			return (status);
		}

		size = sizeof (struct mc_control) + (mcport->ndevgrps - 1) *
		    sizeof (mccontrol->devgrpids[0]);
		mccontrol = kmem_zalloc(size, KM_SLEEP);

		mccontrol->id = mcport->mctrl_node.id;
		mccontrol->ndevgrps = mcport->ndevgrps;
		for (i = 0; i < mcport->ndevgrps; i++) {
			mccontrol->devgrpids[i].globalid = mcport->devgrpids[i];
			mccontrol->devgrpids[i].localid =
			    mcport->devgrpids[i] % NDGRPS;
			DPRINTF(MC_CMD_DEBUG, ("MCIOC_CONTROL: devgrp id %lu\n",
			    *(uint64_t *)&mccontrol->devgrpids[i]));
		}
		mutex_exit(&mcdatamutex);

		if (copyout(mccontrol, (void *)arg, size))
			status = EFAULT;

		kmem_free(mccontrol, size);
		return (status);

	/*
	 * input: id
	 *
	 * return    0: CPU flushed successfully.
	 *	EINVAL: the id wasn't found
	 */
	case MCIOC_ECFLUSH:
		mutex_enter(&cpu_lock);
		cpu = cpu_get((processorid_t)arg);
		mutex_exit(&cpu_lock);
		if (cpu == NULL)
			return (EINVAL);

		xc_one(arg, (xcfunc_t *)cpu_flush_ecache, 0, 0);

		return (0);

	default:
		DPRINTF(MC_CMD_DEBUG, ("DEFAULT: cmd is wrong\n"));
		return (EFAULT);
	}
}

/*
 * Get the Memory Address Decoding Registers and construct the lists.
 * The local/remote check below works around Cheetah's restriction that the
 * registers cannot be read through the mapping when the port id (where the
 * MC registers live) equals the cpu id (where the thread is running).
 */
static int
mc_get_mcregs(struct mc_soft_state *softsp)
{
	int i;
	int err = 0;
	uint64_t madreg;
	uint64_t ma_reg_array[NBANKS];	/* there are NBANKS of madrs */

	/* Construct lists for MC, mctrl_info, dgrp_info, and device_info */
	mc_construct(softsp->portid, softsp->memlayoutp);

	/*
	 * If memlayoutp is NULL, the Memory Controller is disabled, and
	 * no banks or segments need to be created.
	 */
	if (softsp->memlayoutp == NULL)
		goto exit;

	/*
	 * Get the content of 4 Memory Address Decoding Registers, and
	 * construct lists of logical banks and segments.
	 */
	for (i = 0; i < NBANKS; i++) {
		DPRINTF(MC_REG_DEBUG, ("get_mcregs: mapreg=0x%p portid=%d "
		    "cpu=%d\n", (void *)softsp->mc_base, softsp->portid,
		    CPU->cpu_id));

		kpreempt_disable();
		if (softsp->portid == (cpunodes[CPU->cpu_id].portid))
			madreg = get_mcr(MADR0OFFSET + (i * REGOFFSET));
		else
			madreg = *((uint64_t *)(softsp->mc_base + MADR0OFFSET +
			    (i * REGOFFSET)));
		kpreempt_enable();

		DPRINTF(MC_REG_DEBUG, ("get_mcregs 2: memlayoutp=0x%p madreg "
		    "reg=0x%lx\n", softsp->memlayoutp, madreg));

		ma_reg_array[i] = madreg;

		if ((err = mlayout_add(softsp->portid, i, madreg,
		    softsp->memlayoutp)) == -1)
			break;
	}

	/*
	 * Create the logical bank property for this mc node. This
	 * property is an encoded array of the madr for each logical
	 * bank (there are NBANKS of these).
	 */
	if (ddi_prop_exists(DDI_DEV_T_ANY, softsp->dip,
	    DDI_PROP_NOTPROM | DDI_PROP_DONTPASS,
	    MEM_CFG_PROP_NAME) != 1) {
		(void) ddi_prop_create(DDI_DEV_T_NONE, softsp->dip,
		    DDI_PROP_CANSLEEP, MEM_CFG_PROP_NAME,
		    (caddr_t)&ma_reg_array, sizeof (ma_reg_array));
	}

exit:
	if (!err) {
		mutex_enter(&mcdatamutex);
		nmcs++;
		mutex_exit(&mcdatamutex);
	}
	return (err);
}

/*
 * Translate a <DIMM, offset> pair to a physical address.
 */
static int
mc_offset_to_addr(struct seg_info *seg,
    struct bank_info *bank, uint64_t off, uint64_t *addr)
{
	uint64_t base, size, line, remainder;
	uint32_t ifactor;

	/*
	 * Compute the half-dimm size in bytes.
	 * Note that bank->size represents the number of data bytes,
	 * and does not include the additional bits used for ecc, mtag,
	 * and mtag ecc information in each 144-bit checkword.
	 * For calculating the offset to a checkword we need the size
	 * including the additional 8 bytes for each 64 data bytes of
	 * a cache line.
	 */
	size = ((bank->size / 4) / 64) * 72;
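
	/*
	 * Worked example (sizes assumed for illustration): a 1 GB logical
	 * bank spreads 256 MB of data across each of its four DIMM halves;
	 * scaling by 72/64 to include the check bits gives a half-dimm
	 * size of 288 MB.
	 */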

	/*
	 * Check if the offset is within this bank. This depends on the position
	 * of the bank, i.e., whether it is the front bank or the back bank.
	 */
	base = size * bank->pos;

	if ((off < base) || (off >= (base + size)))
		return (-1);

	/*
	 * Compute the offset within the half-dimm.
	 */
	off -= base;

	/*
	 * Compute the line within the half-dimm. This is the same as the line
	 * within the bank since each DIMM in a bank contributes uniformly
	 * 144 bits (18 bytes) to a cache line.
	 */
	line = off / QWORD_SIZE_BYTES;

	remainder = off % QWORD_SIZE_BYTES;

	/*
	 * Compute the line within the segment.
	 * The bank->lm field indicates the order in which cache lines are
	 * distributed across the banks of a segment (See the Cheetah PRM).
	 * The interleave factor the bank is programmed with is used instead
	 * of the segment interleave factor since a segment can be composed
	 * of banks with different interleave factors if the banks are not
	 * uniform in size.
	 */
	ifactor = (bank->lk ^ 0xF) + 1;
	line = (line * ifactor) + bank->lm;

	/*
	 * Compute the physical address assuming that there are 64 data bytes
	 * in a cache line.
	 */
	*addr = (line << 6) + seg->base;
	*addr += remainder * 16;

	return (0);
}

/*
 * Translate a physical address to a <DIMM, offset> pair.
 */
static void
mc_addr_to_offset(struct seg_info *seg,
    struct bank_info *bank, uint64_t addr, uint64_t *off)
{
	uint64_t base, size, line, remainder;
	uint32_t ifactor;

	/*
	 * Compute the line within the segment assuming that there are 64 data
	 * bytes in a cache line.
	 */
	line = (addr - seg->base) / 64;

	/*
	 * The lm (lower match) field from the Memory Address Decoding Register
	 * for this bank determines which lines within a memory segment this
	 * bank should respond to.  These are the actual address bits the
	 * interleave is done over (See the Cheetah PRM).
	 * In other words, the lm field indicates the order in which the cache
	 * lines are distributed across the banks of a segment, and thus it
	 * can be used to compute the line within this bank. This is the same
	 * as the line within the half-dimm, because each DIMM in a bank
	 * contributes uniformly to every cache line.
	 */
	ifactor = (bank->lk ^ 0xF) + 1;
	line = (line - bank->lm)/ifactor;

	/*
	 * Compute the offset within the half-dimm. This depends on whether
	 * the bank is a front logical bank or a back logical bank.
	 */
	*off = line * QWORD_SIZE_BYTES;

	/*
	 * Compute the half-dimm size in bytes.
	 * Note that bank->size represents the number of data bytes,
	 * and does not include the additional bits used for ecc, mtag,
	 * and mtag ecc information in each 144-bit quadword.
	 * For calculating the offset to a checkword we need the size
	 * including the additional 8 bytes for each 64 data bytes of
	 * a cache line.
	 */
	size = ((bank->size / 4) / 64) * 72;

	/*
	 * Compute the offset within the dimm to the nearest line. This
	 * depends on whether the bank is a front logical bank or a back
	 * logical bank.
	 */
	base = size * bank->pos;
	*off += base;

	remainder = (addr - seg->base) % 64;
	remainder /= 16;
	*off += remainder;
}
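
/*
 * Round-trip example for the two translations above (register values
 * assumed for illustration): for a front bank (pos 0) with lk = 0xe
 * (interleave factor 2) and lm = 1 in a segment based at 0, physical
 * address 0xd0 lies on cache line 3 of the segment, which is line
 * (3 - 1) / 2 = 1 of the half-dimm, giving offset
 * 1 * QWORD_SIZE_BYTES + (0xd0 % 64) / 16 = 19; feeding offset 19 back
 * through mc_offset_to_addr() reproduces 0xd0.
 */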

/*
 * A cache line is composed of four quadwords, each with its associated ECC
 * and an MTag along with the MTag's associated ECC. This is depicted below:
 *
 * |                    Data                    |   ECC   | Mtag |MTag ECC|
 *  127                                         0 8       0 2    0 3      0
 *
 * synd_code is mapped to mc_get_mem_unum in the following order:
 *  143                                         16        7      4        0
 *
 * |  Quadword  0  |  Quadword  1  |  Quadword  2  |  Quadword  3  |
 *  575         432 431         288 287         144 143		   0
 *
 * dimm table: each bit in a cache line needs two bits to identify one of
 *      the four dimms, so the table needs 144 bytes (576 * 2 / 8). The
 *      content is in big-endian order, i.e. dimm_table[0] represents
 *      bits 572 to 575.
 *
 * pin table: each bit in a cache line needs one byte to record its pin
 *      position (maximum 230), so the table needs 576 bytes. The table
 *      index follows the bit position within a cache line, i.e.
 *      pin_table[0] represents bit 0, Mtag ECC 0 of Quadword 3.
 *
 * The following is a mapping from syndrome code to the quadword logical
 * layout on Safari; refer to Figure 3-4 of the Excalibur Architecture
 * Manual. This table could be moved to cheetah.c if other platform teams
 * agree with the bit layout of the quadword.
 */

static uint8_t qwordmap[] =
{
16,   17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,  30,  31,
32,   33,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,  47,
48,   49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,
64,   65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,  79,
80,   81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,  92,  93,  94,  95,
96,   97,  98,  99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127,
128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
7,    8,   9,  10,  11,  12,  13,  14,  15,   4,   5,   6,   0,   1,   2,   3,
};
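
/*
 * Example trace of the single-bit lookup in mc_get_mem_unum() below (values
 * chosen for illustration): for synd_code 0 on an address in quadword 0
 * ((paddr & 0x3f) / 16 == 0), qwordmap[0] = 16, so
 * pos_cacheline = (3 - 0) * QWORD_SIZE + 16 = 448 and
 * position = 575 - 448 = 127; the DIMM index is then the two-bit field at
 * dimm table byte 127 * 2 / 8 = 31, shifted right by (3 - 127 % 4) * 2 = 0.
 */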


/* ARGSUSED */
static int
mc_get_mem_unum(int synd_code, uint64_t paddr, char *buf, int buflen, int *lenp)
{
	int i, upper_pa, lower_pa, dimmoffset;
	int quadword, pos_cacheline, position, index, idx4dimm;
	int qwlayout = synd_code;
	short offset, data;
	char unum[UNUM_NAMLEN];
	struct dimm_info *dimmp;
	struct pin_info *pinp;
	struct bank_info *bank;

	/*
	 * Enforce old Openboot requirement for synd code, either a single-bit
	 * code from 0..QWORD_SIZE-1 or -1 (multi-bit error).
	 */
	if (qwlayout < -1 || qwlayout >= QWORD_SIZE)
		return (EINVAL);

	unum[0] = '\0';

	upper_pa = (paddr & MADR_UPA_MASK) >> MADR_UPA_SHIFT;
	lower_pa = (paddr & MADR_LPA_MASK) >> MADR_LPA_SHIFT;

	DPRINTF(MC_GUNUM_DEBUG, ("qwlayout %d\n", qwlayout));

	/*
	 * Scan all logical banks to get one responding to the physical
	 * address. Then compute the index to look up dimm and pin tables
	 * to generate the unum.
	 */
	mutex_enter(&mcdatamutex);
	bank = (struct bank_info *)bank_head;
	while (bank != NULL) {
		int bankid, mcid, bankno_permc;

		bankid = bank->bank_node.id;
		bankno_permc = bankid % NBANKS;
		mcid = bankid / NBANKS;

		/*
		 * The Address Decoding logic decodes the different fields
		 * in the Memory Address Decoding register to determine
		 * whether a particular logical bank should respond to a
		 * physical address.
		 */
		if ((!bank->valid) || ((~(~(upper_pa ^ bank->um) |
		    bank->uk)) || (~(~(lower_pa ^ bank->lm) | bank->lk)))) {
			bank = (struct bank_info *)bank->bank_node.next;
			continue;
		}

		dimmoffset = (bankno_permc % NDGRPS) * NDIMMS;

		dimmp = (struct dimm_info *)bank->dimminfop;
		ASSERT(dimmp != NULL);

		if ((qwlayout >= 0) && (qwlayout < QWORD_SIZE)) {
			/*
			 * single-bit error handling; we can identify the
			 * specific DIMM.
			 */

			pinp = (struct pin_info *)&dimmp->data[0];

			if (!dimmp->sym_flag)
				pinp++;

			quadword = (paddr & 0x3f) / 16;
			/* or quadword = (paddr >> 4) % 4; */
			pos_cacheline = ((3 - quadword) * QWORD_SIZE) +
			    qwordmap[qwlayout];
			position = 575 - pos_cacheline;
			index = position * 2 / 8;
			offset = position % 4;

			/*
			 * Trade-off: we don't add the pin number to the
			 * unum string because error statistics accumulate
			 * per DIMM, not per pin. It would have been:
			 * (void) sprintf(unum, "Pin %1u ", (uint_t)
			 * pinp->pintable[pos_cacheline]);
			 */
			DPRINTF(MC_GUNUM_DEBUG, ("Pin number %1u\n",
			    (uint_t)pinp->pintable[pos_cacheline]));
			data = pinp->dimmtable[index];
			idx4dimm = (data >> ((3 - offset) * 2)) & 3;

			(void) strncpy(unum,
			    (char *)dimmp->label[dimmoffset + idx4dimm],
			    UNUM_NAMLEN);
			DPRINTF(MC_GUNUM_DEBUG, ("unum %s\n", unum));
			/*
			 * platform hook for adding label information to unum.
			 */
			mc_add_mem_unum_label(unum, mcid, bankno_permc,
			    idx4dimm);
		} else {
			char *p = unum;
			size_t res = UNUM_NAMLEN;

			/*
			 * multi-bit error handling; we can only identify
			 * the bank of DIMMs.
			 */

			for (i = 0; (i < NDIMMS) && (res > 0); i++) {
				(void) snprintf(p, res, "%s%s",
				    i == 0 ? "" : " ",
				    (char *)dimmp->label[dimmoffset + i]);
				res -= strlen(p);
				p += strlen(p);
			}

			/*
			 * platform hook for adding label information
			 * to unum.
			 */
			mc_add_mem_unum_label(unum, mcid, bankno_permc, -1);
		}
		mutex_exit(&mcdatamutex);
		if ((strlen(unum) >= UNUM_NAMLEN) ||
		    (strlen(unum) >= buflen)) {
			return (ENAMETOOLONG);
		} else {
			(void) strncpy(buf, unum, buflen);
			*lenp = strlen(buf);
			return (0);
		}
	}	/* end of while loop for logical bank list */

	mutex_exit(&mcdatamutex);
	return (ENXIO);
}

/* ARGSUSED */
static int
mc_get_mem_offset(uint64_t paddr, uint64_t *offp)
{
	int upper_pa, lower_pa;
	struct bank_info *bank;
	struct seg_info *seg;

	upper_pa = (paddr & MADR_UPA_MASK) >> MADR_UPA_SHIFT;
	lower_pa = (paddr & MADR_LPA_MASK) >> MADR_LPA_SHIFT;

	/*
	 * Scan all logical banks to get one responding to the physical
	 * address.
	 */
	mutex_enter(&mcdatamutex);
	bank = (struct bank_info *)bank_head;
	while (bank != NULL) {
		/*
		 * The Address Decoding logic decodes the different fields
		 * in the Memory Address Decoding register to determine
		 * whether a particular logical bank should respond to a
		 * physical address.
		 */
		if ((!bank->valid) || ((~(~(upper_pa ^ bank->um) |
		    bank->uk)) || (~(~(lower_pa ^ bank->lm) | bank->lk)))) {
			bank = (struct bank_info *)bank->bank_node.next;
			continue;
		}

		seg = (struct seg_info *)mc_node_get(bank->seg_id, seg_head);
		ASSERT(seg != NULL);
		ASSERT(paddr >= seg->base);

		mc_addr_to_offset(seg, bank, paddr, offp);

		mutex_exit(&mcdatamutex);
		return (0);
	}

	mutex_exit(&mcdatamutex);
	return (ENXIO);
}

/*
 * Translate a DIMM <id, offset> pair to a physical address.
 */
static int
mc_get_mem_addr(int mcid, char *sid, uint64_t off, uint64_t *paddr)
{
	struct seg_info *seg;
	struct bank_info *bank;
	int first_seg_id;
	int i, found;

	ASSERT(sid != NULL);

	mutex_enter(&mcdatamutex);

	rw_enter(&mcdimmsids_rw, RW_READER);

	/*
	 * If DIMM serial ids have not been cached yet, tell the
	 * caller to try again.
	 */
	if (mc_dimm_sids == NULL) {
		rw_exit(&mcdimmsids_rw);
		return (EAGAIN);
	}

	for (i = 0; i < max_entries; i++) {
		if (mc_dimm_sids[i].mcid == mcid)
			break;
	}

	if (i == max_entries) {
		rw_exit(&mcdimmsids_rw);
		mutex_exit(&mcdatamutex);
		return (ENODEV);
	}

	first_seg_id = mc_dimm_sids[i].seg_id;

	seg = (struct seg_info *)mc_node_get(first_seg_id, seg_head);

	rw_exit(&mcdimmsids_rw);

	if (seg == NULL) {
		mutex_exit(&mcdatamutex);
		return (ENODEV);
	}

	found = 0;

	for (bank = seg->hb_inseg; bank; bank = bank->n_inseg) {
		ASSERT(bank->valid);

		for (i = 0; i < NDIMMS; i++) {
			if (strncmp((char *)bank->dimmsidp[i], sid,
			    DIMM_SERIAL_ID_LEN)  == 0)
				break;
		}

		if (i == NDIMMS)
			continue;

		if (mc_offset_to_addr(seg, bank, off, paddr) == -1)
			continue;
		found = 1;
		break;
	}

	if (found) {
		mutex_exit(&mcdatamutex);
		return (0);
	}

	/*
	 * If a bank wasn't found, it may be in another segment.
	 * This can happen if the different logical banks of an MC
	 * have different interleave factors.  To deal with this
	 * possibility, we'll do a brute-force search for banks
	 * for this MC with a different seg id than above.
	 */
	bank = (struct bank_info *)bank_head;
	while (bank != NULL) {

		if (!bank->valid) {
			bank = (struct bank_info *)bank->bank_node.next;
			continue;
		}

		if (bank->bank_node.id / NBANKS != mcid) {
			bank = (struct bank_info *)bank->bank_node.next;
			continue;
		}

		/* Ignore banks in the segment we looked in above. */
		if (bank->seg_id == mc_dimm_sids[i].seg_id) {
			bank = (struct bank_info *)bank->bank_node.next;
			continue;
		}

		for (i = 0; i < NDIMMS; i++) {
			if (strncmp((char *)bank->dimmsidp[i], sid,
			    DIMM_SERIAL_ID_LEN)  == 0)
				break;
		}

		if (i == NDIMMS) {
			bank = (struct bank_info *)bank->bank_node.next;
			continue;
		}

		seg = (struct seg_info *)mc_node_get(bank->seg_id, seg_head);

		if (mc_offset_to_addr(seg, bank, off, paddr) == -1) {
			bank = (struct bank_info *)bank->bank_node.next;
			continue;
		}

		found = 1;
		break;
	}

	mutex_exit(&mcdatamutex);

	if (found)
		return (0);
	else
		return (ENOENT);
}

static int
mc_get_mem_info(int synd_code, uint64_t paddr,
    uint64_t *mem_sizep, uint64_t *seg_sizep, uint64_t *bank_sizep,
    int *segsp, int *banksp, int *mcidp)
{
	int upper_pa, lower_pa;
	struct bank_info *bankp;

	if (synd_code < -1 || synd_code >= QWORD_SIZE)
		return (EINVAL);

	upper_pa = (paddr & MADR_UPA_MASK) >> MADR_UPA_SHIFT;
	lower_pa = (paddr & MADR_LPA_MASK) >> MADR_LPA_SHIFT;

	/*
	 * Scan all logical banks to get one responding to the physical
	 * address.
	 */
	mutex_enter(&mcdatamutex);
	bankp = (struct bank_info *)bank_head;
	while (bankp != NULL) {
		struct seg_info *segp;
		int bankid, mcid;

		bankid = bankp->bank_node.id;
		mcid = bankid / NBANKS;

		/*
		 * The Address Decoding logic decodes the different fields
		 * in the Memory Address Decoding register to determine
		 * whether a particular logical bank should respond to a
		 * physical address.
		 */
		if ((!bankp->valid) || ((~(~(upper_pa ^ bankp->um) |
		    bankp->uk)) || (~(~(lower_pa ^ bankp->lm) | bankp->lk)))) {
			bankp = (struct bank_info *)bankp->bank_node.next;
			continue;
		}

		/*
		 * Get the corresponding segment.
		 */
		if ((segp = (struct seg_info *)mc_node_get(bankp->seg_id,
		    seg_head)) == NULL) {
			mutex_exit(&mcdatamutex);
			return (EFAULT);
		}

		*mem_sizep = memsize;
		*seg_sizep = segp->size;
		*bank_sizep = bankp->size;
		*segsp = nsegments;
		*banksp = segp->nbanks;
		*mcidp = mcid;

		mutex_exit(&mcdatamutex);

		return (0);

	}	/* end of while loop for logical bank list */

	mutex_exit(&mcdatamutex);
	return (ENXIO);
}

/*
 * Construct the lists for an enabled MC, with memory sizes initialized to 0
 * (the sizes are filled in later by mlayout_add()).
 * The lists are connected as follows:
 * Attached MC -> device group list -> device list (per devgrp).
 */
static void
mc_construct(int mc_id, void *dimminfop)
{
	int i, j, idx, dmidx;
	struct mctrl_info *mctrl;
	struct dgrp_info *dgrp;
	struct device_info *dev;
	struct	dimm_info *dimmp = (struct  dimm_info *)dimminfop;

	mutex_enter(&mcdatamutex);
	/* allocate for mctrl_info and bank_info */
	if ((mctrl = (struct mctrl_info *)mc_node_get(mc_id,
	    mctrl_head)) != NULL) {
		cmn_err(CE_WARN, "mc_construct: mctrl %d exists\n", mc_id);
		mutex_exit(&mcdatamutex);
		return;
	}

	mctrl = kmem_zalloc(sizeof (struct mctrl_info), KM_SLEEP);

	/*
	 * If dimminfop is NULL, the Memory Controller is disabled, and
	 * the number of device groups will be zero.
	 */
	if (dimminfop == NULL) {
		mctrl->mctrl_node.id = mc_id;
		mctrl->ndevgrps = 0;
		mc_node_add((mc_dlist_t *)mctrl, &mctrl_head, &mctrl_tail);
		mutex_exit(&mcdatamutex);
		return;
	}

	/* add the entry on dgrp_info list */
	for (i = 0; i < NDGRPS; i++) {
		idx = mc_id * NDGRPS + i;
		mctrl->devgrpids[i] = idx;
		if ((dgrp = (struct dgrp_info *)mc_node_get(idx, dgrp_head))
		    != NULL) {
			cmn_err(CE_WARN, "mc_construct: devgrp %d exists\n",
			    idx);
			continue;
		}

		dgrp = kmem_zalloc(sizeof (struct dgrp_info), KM_SLEEP);

		/* add the entry on device_info list */
		for (j = 0; j < NDIMMS; j++) {
			dmidx = idx * NDIMMS + j;
			dgrp->deviceids[j] = dmidx;
			if ((dev = (struct device_info *)
			    mc_node_get(dmidx, device_head)) != NULL) {
				cmn_err(CE_WARN, "mc_construct: device %d "
				    "exists\n", dmidx);
				continue;
			}
			dev = kmem_zalloc(sizeof (struct device_info),
			    KM_SLEEP);
			dev->dev_node.id = dmidx;
			dev->size = 0;
			(void) strncpy(dev->label, (char *)
			    dimmp->label[i * NDIMMS + j], MAX_DEVLEN);

			mc_node_add((mc_dlist_t *)dev, &device_head,
			    &device_tail);
		}	/* for loop for constructing device_info */

		dgrp->dgrp_node.id = idx;
		dgrp->ndevices = NDIMMS;
		dgrp->size = 0;
		mc_node_add((mc_dlist_t *)dgrp, &dgrp_head, &dgrp_tail);

	}	/* end of for loop for constructing dgrp_info list */

	mctrl->mctrl_node.id = mc_id;
	mctrl->ndevgrps = NDGRPS;
	mc_node_add((mc_dlist_t *)mctrl, &mctrl_head, &mctrl_tail);
	mutex_exit(&mcdatamutex);
}

/*
 * Construct the lists for the Memory Configuration from a logical viewpoint.
 *
 * Retrieve information from the Memory Address Decoding Register and set up
 * the bank and segment lists. Link each bank to its corresponding device
 * group, update the sizes of the device group and its devices, and connect
 * the bank to its segment.
 *
 * Memory Address Decoding Register
 * -------------------------------------------------------------------------
 * |63|62    53|52      41|40  37|36     20|19 18|17  14|13 12|11  8|7     0|
 * |-----------|----------|------|---------|-----|------|-----|-----|-------|
 * |V |    -   |    UK    |   -  |    UM   |  -  |  LK  |  -  | LM  |   -   |
 * -------------------------------------------------------------------------
 *
 */
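
/*
 * Decoding example (register values assumed for illustration): a MADR with
 * V = 1, UK = 0xf, UM = 0x10, LK = 0xe and LM = 0x1 describes a valid bank
 * with interleave factor (0xe ^ 0xf) + 1 = 2, logical bank size
 * ((0xf + 1) * 0x4000000) / 2 = 512 MB, and segment base
 * (0x10 & ~0xf) << MADR_UPA_SHIFT, i.e. 1 GB if MADR_UPA_SHIFT is 26, as
 * the 0x4000000 (64 MB) granularity used in mlayout_add() below suggests.
 */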

static int
mlayout_add(int mc_id, int bank_no, uint64_t reg, void *dimminfop)
{
	int i, dmidx, idx;
	uint32_t ifactor;
	int status = 0;
	uint64_t size, base;
	struct seg_info *seg_curr;
	struct bank_info *bank_curr;
	struct dgrp_info *dgrp;
	struct device_info *dev;
	union {
		struct {
			uint64_t valid	: 1;
			uint64_t resrv1	: 10;
			uint64_t uk	: 12;
			uint64_t resrv2	: 4;
			uint64_t um	: 17;
			uint64_t resrv3	: 2;
			uint64_t lk	: 4;
			uint64_t resrv4	: 2;
			uint64_t lm	: 4;
			uint64_t resrv5	: 8;
		} _s;
		uint64_t madreg;
	} mcreg;

	mcreg.madreg = reg;

	DPRINTF(MC_CNSTRC_DEBUG, ("mlayout_add: mc_id %d, bank num "
	    "%d, reg 0x%lx\n", mc_id, bank_no, reg));

	/* add the entry on bank_info list */
	idx = mc_id * NBANKS + bank_no;

	mutex_enter(&mcdatamutex);
	if ((bank_curr = (struct bank_info *)mc_node_get(idx, bank_head))
	    != NULL) {
		cmn_err(CE_WARN, "mlayout_add: bank %d exists\n", bank_no);
		goto exit;
	}

	bank_curr = kmem_zalloc(sizeof (struct bank_info), KM_SLEEP);
	bank_curr->bank_node.id = idx;
	bank_curr->valid = mcreg._s.valid;
	bank_curr->dimminfop = dimminfop;

	if (!mcreg._s.valid) {
		mc_node_add((mc_dlist_t *)bank_curr, &bank_head, &bank_tail);
		goto exit;
	}

	/*
	 * size of a logical bank = size of segment / interleave factor
	 * This formula works not only for the regular configuration,
	 * i.e. the number of banks in a segment equals the maximum
	 * interleave factor, but also for special cases, e.g. a 3-bank
	 * interleave where one bank is 2-way interleaved and the other
	 * two are 4-way, so the bank sizes are segment size / 2 and / 4
	 * respectively.
	 */
	ifactor = (mcreg._s.lk ^ 0xF) + 1;
	size = (((mcreg._s.uk & 0x3FF) + 1) * 0x4000000) / ifactor;
	base = mcreg._s.um & ~mcreg._s.uk;
	base <<= MADR_UPA_SHIFT;

	bank_curr->uk = mcreg._s.uk;
	bank_curr->um = mcreg._s.um;
	bank_curr->lk = mcreg._s.lk;
	bank_curr->lm = mcreg._s.lm;
	bank_curr->size = size;

	/*
	 * The bank's position depends on which halves of the DIMMs it consists
	 * of. The front-side halves of the 4 DIMMs constitute the front bank
	 * and the back-side halves constitute the back bank. Bank numbers
	 * 0 and 1 are front-side banks and bank numbers 2 and 3 are back side
	 * banks.
	 */
	bank_curr->pos = bank_no >> 1;
	ASSERT((bank_curr->pos == 0) || (bank_curr->pos == 1));

	/*
	 * Workaround to keep gcc and SS12 lint happy.
	 * Lint expects lk, uk and um in the format statement below
	 * to use %lx, but this produces a warning when compiled with
	 * gcc.
	 */

#if defined(lint)
	DPRINTF(MC_CNSTRC_DEBUG, ("mlayout_add 3: logical bank num %d, "
	    "lk 0x%lx uk 0x%lx um 0x%lx ifactor 0x%x size 0x%lx base 0x%lx\n",
	    idx, mcreg._s.lk, mcreg._s.uk, mcreg._s.um, ifactor, size, base));
#else /* lint */
	DPRINTF(MC_CNSTRC_DEBUG, ("mlayout_add 3: logical bank num %d, "
	    "lk 0x%x uk 0x%x um 0x%x ifactor 0x%x size 0x%lx base 0x%lx\n",
	    idx, mcreg._s.lk, mcreg._s.uk, mcreg._s.um, ifactor, size, base));
#endif /* lint */

	/* connect the entry and update the size on dgrp_info list */
	idx = mc_id * NDGRPS + (bank_no % NDGRPS);
	if ((dgrp = (struct dgrp_info *)mc_node_get(idx, dgrp_head)) == NULL) {
		/* all available dgrps should have been linked in mc_construct */
		cmn_err(CE_WARN, "mlayout_add: dgrp %d doesn't exist\n", idx);
		kmem_free(bank_curr, sizeof (struct bank_info));
		status = -1;
		goto exit;
	}

	bank_curr->devgrp_id = idx;
	dgrp->size += size;

	/* Update the size of entry on device_info list */
	for (i = 0; i < NDIMMS; i++) {
		dmidx = dgrp->dgrp_node.id * NDIMMS + i;
		dgrp->deviceids[i] = dmidx;

		/* available devices should have been linked in mc_construct */
		if ((dev = (struct device_info *)mc_node_get(dmidx,
		    device_head)) == NULL) {
			cmn_err(CE_WARN, "mlayout_add:dev %d doesn't exist\n",
			    dmidx);
			kmem_free(bank_curr, sizeof (struct bank_info));
			status = -1;
			goto exit;
		}

		dev->size += (size / NDIMMS);

		DPRINTF(MC_CNSTRC_DEBUG, ("mlayout_add DIMM:id %d, size %lu\n",
		    dmidx, size));
	}

	/*
	 * Get the segment by matching the base address, link this bank
	 * to the segment. If not matched, allocate a new segment and
	 * add it at segment list.
	 */
	if (seg_curr = seg_match_base(base)) {
		seg_curr->nbanks++;
		seg_curr->size += size;
		if (ifactor > seg_curr->ifactor)
			seg_curr->ifactor = ifactor;
		bank_curr->seg_id = seg_curr->seg_node.id;
	} else {
		seg_curr = (struct seg_info *)
		    kmem_zalloc(sizeof (struct seg_info), KM_SLEEP);
		bank_curr->seg_id = seg_id;
		seg_curr->seg_node.id = seg_id++;
		seg_curr->base = base;
		seg_curr->size = size;
		seg_curr->nbanks = 1;
		seg_curr->ifactor = ifactor;
		mc_node_add((mc_dlist_t *)seg_curr, &seg_head, &seg_tail);

		nsegments++;
	}

	/* Get the local id of bank which is only unique per segment. */
	bank_curr->local_id = seg_curr->nbanks - 1;

	/* add bank at the end of the list; not sorted by bankid */
	if (seg_curr->hb_inseg != NULL) {
		bank_curr->p_inseg = seg_curr->tb_inseg;
		bank_curr->n_inseg = seg_curr->tb_inseg->n_inseg;
		seg_curr->tb_inseg->n_inseg = bank_curr;
		seg_curr->tb_inseg = bank_curr;
	} else {
		bank_curr->n_inseg = bank_curr->p_inseg = NULL;
		seg_curr->hb_inseg = seg_curr->tb_inseg = bank_curr;
	}
	DPRINTF(MC_CNSTRC_DEBUG, ("mlayout_add: + bank to seg, id %d\n",
	    seg_curr->seg_node.id));

	if (mc_dimm_sids) {
		rw_enter(&mcdimmsids_rw, RW_WRITER);
		mc_update_bank(bank_curr);
		rw_exit(&mcdimmsids_rw);
	}
	mc_node_add((mc_dlist_t *)bank_curr, &bank_head, &bank_tail);

	memsize += size;
	if (seg_curr->nbanks > maxbanks)
		maxbanks = seg_curr->nbanks;

exit:
	mutex_exit(&mcdatamutex);
	return (status);
}

/*
 * Delete nodes related to the given MC on the mc, device group, device,
 * and bank lists. Moreover, delete the corresponding segment if all of its
 * connected banks have been removed.
 *
 * The "delete" argument is 1 if this is called as a result of DDI_DETACH. In
 * this case, the DIMM data structures need to be deleted. The argument is
 * 0 if this is called as a result of DDI_SUSPEND/DDI_RESUME. In this case,
 * the DIMM data structures are left alone.
 */
static void
mlayout_del(int mc_id, int delete)
{
	int i, j, dgrpid, devid, bankid, ndevgrps;
	struct seg_info *seg;
	struct bank_info *bank_curr;
	struct mctrl_info *mctrl;
	mc_dlist_t *dgrp_ptr;
	mc_dlist_t *dev_ptr;
	uint64_t base;

	mutex_enter(&mcdatamutex);

	/* delete mctrl_info */
	if ((mctrl = (struct mctrl_info *)mc_node_get(mc_id, mctrl_head)) !=
	    NULL) {
		ndevgrps = mctrl->ndevgrps;
		mc_node_del((mc_dlist_t *)mctrl, &mctrl_head, &mctrl_tail);
		kmem_free(mctrl, sizeof (struct mctrl_info));
		nmcs--;

		/*
		 * There is no other list left for disabled MC.
		 */
		if (ndevgrps == 0) {
			mutex_exit(&mcdatamutex);
			return;
		}
	} else
		cmn_err(CE_WARN, "MC mlayout_del: mctrl is not found\n");

	/* Delete device groups and devices of the detached MC */
	for (i = 0; i < NDGRPS; i++) {
		dgrpid = mc_id * NDGRPS + i;
		if (!(dgrp_ptr = mc_node_get(dgrpid, dgrp_head))) {
			cmn_err(CE_WARN, "mlayout_del: no devgrp %d\n", dgrpid);
			continue;
		}

		for (j = 0; j < NDIMMS; j++) {
			devid = dgrpid * NDIMMS + j;
			if (dev_ptr = mc_node_get(devid, device_head)) {
				mc_node_del(dev_ptr, &device_head,
				    &device_tail);
				kmem_free(dev_ptr, sizeof (struct device_info));
			} else {
				cmn_err(CE_WARN, "mlayout_del: no dev %d\n",
				    devid);
			}
		}

		mc_node_del(dgrp_ptr, &dgrp_head, &dgrp_tail);
		kmem_free(dgrp_ptr, sizeof (struct dgrp_info));
	}

	/* Delete banks, and delete segments that have no banks left */
	for (i = 0; i < NBANKS; i++) {
		bankid = mc_id * NBANKS + i;
		DPRINTF(MC_DESTRC_DEBUG, ("bank id %d\n", bankid));
		if (!(bank_curr = (struct bank_info *)mc_node_get(bankid,
		    bank_head))) {
			cmn_err(CE_WARN, "mlayout_del: no bank %d\n", bankid);
			continue;
		}

		if (bank_curr->valid) {
			base = bank_curr->um & ~bank_curr->uk;
			base <<= MADR_UPA_SHIFT;
			bank_curr->valid = 0;
			memsize -= bank_curr->size;

			/* Delete bank at segment and segment if no bank left */
			if (!(seg = seg_match_base(base))) {
				cmn_err(CE_WARN, "mlayout_del: no seg\n");
				mc_node_del((mc_dlist_t *)bank_curr, &bank_head,
				    &bank_tail);
				kmem_free(bank_curr, sizeof (struct bank_info));
				continue;
			}

			/* update the bank list at the segment */
			if (bank_curr->n_inseg == NULL) {
				/* node is at the tail of list */
				seg->tb_inseg = bank_curr->p_inseg;
			} else {
				bank_curr->n_inseg->p_inseg =
				    bank_curr->p_inseg;
			}

			if (bank_curr->p_inseg == NULL) {
				/* node is at the head of list */
				seg->hb_inseg = bank_curr->n_inseg;
			} else {
				bank_curr->p_inseg->n_inseg =
				    bank_curr->n_inseg;
			}

			seg->nbanks--;
			seg->size -= bank_curr->size;

			if (seg->nbanks == 0) {
				mc_node_del((mc_dlist_t *)seg, &seg_head,
				    &seg_tail);
				kmem_free(seg, sizeof (struct seg_info));
				nsegments--;
			}

		}
		mc_node_del((mc_dlist_t *)bank_curr, &bank_head, &bank_tail);
		kmem_free(bank_curr, sizeof (struct bank_info));
	}	/* end of for loop for four banks */

	if (mc_dimm_sids && delete) {
		rw_enter(&mcdimmsids_rw, RW_WRITER);
		i = mc_get_sid_cache_index(mc_id);
		if (i >= 0) {
			mc_dimm_sids[i].state = MC_DIMM_SIDS_INVALID;
			if (mc_dimm_sids[i].sids) {
				kmem_free(mc_dimm_sids[i].sids,
				    sizeof (dimm_sid_t) * (NDGRPS * NDIMMS));
				mc_dimm_sids[i].sids = NULL;
			}
		}
		rw_exit(&mcdimmsids_rw);
	}

	mutex_exit(&mcdatamutex);
}

/*
 * Search the list starting at seg_head for a segment with the given base
 * address.
 * input: base address
 * return: pointer to the found segment, or NULL if not found.
 */
static struct seg_info *
seg_match_base(u_longlong_t base)
{
	static struct seg_info *seg_ptr;

	seg_ptr = (struct seg_info *)seg_head;
	while (seg_ptr != NULL) {
		DPRINTF(MC_LIST_DEBUG, ("seg_match: base %lu,given base %llu\n",
		    seg_ptr->base, base));
		if (seg_ptr->base == base)
			break;
		seg_ptr = (struct seg_info *)seg_ptr->seg_node.next;
	}
	return (seg_ptr);
}

/*
 * mc_dlist is a doubly-linked list node, containing a unique id and pointers
 * to the next and previous nodes. seg_info, bank_info, dgrp_info,
 * device_info, and mctrl_info embed it at the top so they can share the add,
 * del, and get operations.
 *
 * New nodes are added at the tail; the list is not sorted.
 *
 * Input: the pointer of the node to be added, head and tail of the list
 */

static void
mc_node_add(mc_dlist_t *node, mc_dlist_t **head, mc_dlist_t **tail)
{
	DPRINTF(MC_LIST_DEBUG, ("mc_node_add: node->id %d head %p tail %p\n",
	    node->id, (void *)*head, (void *)*tail));

	if (*head != NULL) {
		node->prev = *tail;
		node->next = (*tail)->next;
		(*tail)->next = node;
		*tail = node;
	} else {
		node->next = node->prev = NULL;
		*head = *tail = node;
	}
}

/*
 * Input: the pointer of the node to be deleted, head and tail of the list
 *
 * The deleted node may be at any of the following positions:
 * 1. At the tail of the list
 * 2. At the head of the list
 * 3. At both the head and the tail of the list, i.e. the only node left.
 * 4. In the middle of the list
 */

static void
mc_node_del(mc_dlist_t *node, mc_dlist_t **head, mc_dlist_t **tail)
{
	if (node->next == NULL) {
		/* deleted node is at the tail of list */
		*tail = node->prev;
	} else {
		node->next->prev = node->prev;
	}

	if (node->prev == NULL) {
		/* deleted node is at the head of list */
		*head = node->next;
	} else {
		node->prev->next = node->next;
	}
}

/*
 * Search the list from its head for a node matching the given id
 * Input: id and the head of the list
 * Return: pointer to the found node, or NULL if not found
 */
static mc_dlist_t *
mc_node_get(int id, mc_dlist_t *head)
{
	mc_dlist_t *node;

	node = head;
	while (node != NULL) {
		DPRINTF(MC_LIST_DEBUG, ("mc_node_get: id %d, given id %d\n",
		    node->id, id));
		if (node->id == id)
			break;
		node = node->next;
	}
	return (node);
}

/*
 * The mc-us3 driver allows a platform to add extra label
 * information to the unum string. If a platform implements a
 * kernel function called plat_add_mem_unum_label(), it will be
 * executed. This would typically be implemented in the platmod.
 */
static void
mc_add_mem_unum_label(char *buf, int mcid, int bank, int dimm)
{
	if (&plat_add_mem_unum_label)
		plat_add_mem_unum_label(buf, mcid, bank, dimm);
}

static int
mc_get_sid_cache_index(int mcid)
{
	int	i;

	for (i = 0; i < max_entries; i++) {
		if (mcid == mc_dimm_sids[i].mcid)
			return (i);
	}

	return (-1);
}

static void
mc_update_bank(struct bank_info *bank)
{
	int i, j;
	int bankid, mcid, dgrp_no;

	/*
	 * Mark the MC if DIMM sids are not available.
	 * Mark which segment the DIMMs belong to.  Allocate
	 * space to store DIMM serial ids which are later
	 * provided by the platform layer, and update the bank_info
	 * structure with pointers to its serial ids.
	 */
	bankid = bank->bank_node.id;
	mcid = bankid / NBANKS;
	i = mc_get_sid_cache_index(mcid);
	if (mc_dimm_sids[i].state == MC_DIMM_SIDS_INVALID)
		mc_dimm_sids[i].state = MC_DIMM_SIDS_REQUESTED;

	mc_dimm_sids[i].seg_id = bank->seg_id;

	if (mc_dimm_sids[i].sids == NULL) {
		mc_dimm_sids[i].sids = (dimm_sid_t *)kmem_zalloc(
		    sizeof (dimm_sid_t) * (NDGRPS * NDIMMS), KM_SLEEP);
	}

	dgrp_no = bank->devgrp_id % NDGRPS;

	for (j = 0; j < NDIMMS; j++) {
		bank->dimmsidp[j] =
		    &mc_dimm_sids[i].sids[j + (NDIMMS * dgrp_no)];
	}
}

static int
mc_populate_sid_cache(void)
{
	struct bank_info	*bank;

	if (&plat_populate_sid_cache == 0)
		return (ENOTSUP);

	ASSERT(RW_WRITE_HELD(&mcdimmsids_rw));

	bank = (struct bank_info *)bank_head;
	while (bank != NULL) {
		if (!bank->valid) {
			bank = (struct bank_info *)bank->bank_node.next;
			continue;
		}

		mc_update_bank(bank);

		bank = (struct bank_info *)bank->bank_node.next;
	}


	/*
	 * Call to the platform layer to populate the cache
	 * with DIMM serial ids.
	 */
	return (plat_populate_sid_cache(mc_dimm_sids, max_entries));
}

static void
mc_init_sid_cache_thr(void)
{
	ASSERT(mc_dimm_sids == NULL);

	mutex_enter(&mcdatamutex);
	rw_enter(&mcdimmsids_rw, RW_WRITER);

	mc_dimm_sids = plat_alloc_sid_cache(&max_entries);
	(void) mc_populate_sid_cache();

	rw_exit(&mcdimmsids_rw);
	mutex_exit(&mcdatamutex);
}

static int
mc_init_sid_cache(void)
{
	if (&plat_alloc_sid_cache) {
		(void) thread_create(NULL, 0, mc_init_sid_cache_thr, NULL, 0,
		    &p0, TS_RUN, minclsyspri);
		return (0);
	} else
		return (ENOTSUP);
}

static int
mc_get_mem_sid(int mcid, int dimm, char *buf, int buflen, int *lenp)
{
	int	i;

	if (buflen < DIMM_SERIAL_ID_LEN)
		return (ENOSPC);

	/*
	 * If DIMM serial ids have not been cached yet, tell the
	 * caller to try again.
	 */
	if (!rw_tryenter(&mcdimmsids_rw, RW_READER))
		return (EAGAIN);

	if (mc_dimm_sids == NULL) {
		rw_exit(&mcdimmsids_rw);
		return (EAGAIN);
	}

	/*
	 * Find dimm serial id using mcid and dimm #
	 */
	for (i = 0; i < max_entries; i++) {
		if (mc_dimm_sids[i].mcid == mcid)
			break;
	}
	if ((i == max_entries) || (!mc_dimm_sids[i].sids)) {
		rw_exit(&mcdimmsids_rw);
		return (ENOENT);
	}

	(void) strlcpy(buf, mc_dimm_sids[i].sids[dimm],
	    DIMM_SERIAL_ID_LEN);
	*lenp = strlen(buf);

	rw_exit(&mcdimmsids_rw);
	return (0);
}