/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*
 * sun4v Memory DR Module
 */


#include <sys/types.h>
#include <sys/cmn_err.h>
#include <sys/vmem.h>
#include <sys/kmem.h>
#include <sys/systm.h>
#include <sys/machsystm.h>	/* for page_freelist_coalesce() */
#include <sys/errno.h>
#include <sys/memnode.h>
#include <sys/memlist.h>
#include <sys/memlist_impl.h>
#include <sys/tuneable.h>
#include <sys/proc.h>
#include <sys/disp.h>
#include <sys/debug.h>
#include <sys/vm.h>
#include <sys/callb.h>
#include <sys/memlist_plat.h>	/* for installed_top_size() */
#include <sys/condvar_impl.h>	/* for CV_HAS_WAITERS() */
#include <sys/dumphdr.h>	/* for dump_resize() */
#include <sys/atomic.h>		/* for use in stats collection */
#include <sys/rwlock.h>
#include <vm/seg_kmem.h>
#include <vm/seg_kpm.h>
#include <vm/page.h>
#include <vm/vm_dep.h>
#define	SUNDDI_IMPL		/* so sunddi.h will not redefine splx() et al */
#include <sys/sunddi.h>
#include <sys/mem_config.h>
#include <sys/mem_cage.h>
#include <sys/lgrp.h>
#include <sys/ddi.h>

#include <sys/modctl.h>
#include <sys/sysevent/dr.h>
#include <sys/mach_descrip.h>
#include <sys/mdesc.h>
#include <sys/ds.h>
#include <sys/drctl.h>
#include <sys/dr_util.h>
#include <sys/dr_mem.h>
#include <sys/suspend.h>


/*
 * DR operations are subject to Memory Alignment restrictions
 * for both address and the size of the request.
 */
#define	MA_ADDR	0x10000000	/* addr alignment 256M */
#define	MA_SIZE	0x10000000	/* size alignment 256M */

/*
 * True when both the address and the size of the mblk pointed to by
 * 'm' are aligned to MA_ADDR and MA_SIZE respectively. 'm' is
 * evaluated twice, so it must be free of side effects.
 */
#define	MBLK_IS_VALID(m) \
	(IS_P2ALIGNED((m)->addr, MA_ADDR) && IS_P2ALIGNED((m)->size, MA_SIZE))

/*
 * Handle of the memory delete currently being run by mem_del(), or
 * NULL when no delete is in progress. It is consulted by
 * dr_mem_del_stat() and dr_mem_del_cancel().
 */
static memhandle_t dr_mh;	/* memory handle for delete */

/* Module linkage: registered through mod_miscops as a misc module. */
static struct modlmisc modlmisc = {
	&mod_miscops,
	"sun4v memory DR"
};

static struct modlinkage modlinkage = {
	MODREV_1,
	(void *)&modlmisc,
	NULL
};

/* _fini() refuses to unload (EBUSY) while this is zero. */
static int dr_mem_allow_unload = 0;

/*
 * Signature shared by dr_mem_configure() and dr_mem_unconfigure():
 * operate on one mblk, return a DR_MEM_RES_* code and store the
 * resulting DR_MEM_STAT_* state through the int pointer.
 */
typedef int (*fn_t)(dr_mem_blk_t *, int *);

/*
 * Global Domain Services (DS) Handle
 *
 * Recorded by dr_mem_reg_handler(), reset to DS_INVALID_HDL by
 * dr_mem_unreg_handler(), and used by dr_mem_data_handler() to send
 * responses.
 */
static ds_svc_hdl_t ds_handle;

/*
 * Supported DS Capability Versions
 */
static ds_ver_t		dr_mem_vers[] = { { 1, 0 } };
#define	DR_MEM_NVERS	(sizeof (dr_mem_vers) / sizeof (dr_mem_vers[0]))

/*
 * DS Capability Description
 *
 * Passed to ds_cap_init() in dr_mem_init() and to ds_cap_fini() in
 * dr_mem_fini().
 */
static ds_capability_t dr_mem_cap = {
	DR_MEM_DS_ID,		/* svc_id */
	dr_mem_vers,		/* vers */
	DR_MEM_NVERS		/* nvers */
};

/*
 * DS Callbacks
 *
 * Registration, unregistration and incoming-data callbacks; they are
 * wired into dr_mem_ops below.
 */
static void dr_mem_reg_handler(ds_cb_arg_t, ds_ver_t *, ds_svc_hdl_t);
static void dr_mem_unreg_handler(ds_cb_arg_t arg);
static void dr_mem_data_handler(ds_cb_arg_t arg, void *buf, size_t buflen);

/*
 * DS Client Ops Vector
 *
 * Handed to ds_cap_init() together with dr_mem_cap. No callback
 * argument is used (cb_arg is NULL).
 */
static ds_clnt_ops_t dr_mem_ops = {
	dr_mem_reg_handler,	/* ds_reg_cb */
	dr_mem_unreg_handler,	/* ds_unreg_cb */
	dr_mem_data_handler,	/* ds_data_cb */
	NULL			/* cb_arg */
};

/*
 * Operation Results
 *
 * Used internally to gather results while an operation on a
 * list of mblks is in progress. In particular, it is used to
 * keep track of which mblks have already failed so that they are
 * not processed further, and the manner in which they failed.
 *
 * An array of these is created by dr_mem_res_array_init(), packed
 * into the response by dr_mem_pack_response() and released by
 * dr_mem_res_array_fini().
 */
typedef struct {
	uint64_t	addr;		/* mblk base address */
	uint64_t	size;		/* mblk size */
	uint32_t	result;		/* DR_MEM_RES_* code */
	uint32_t	status;		/* DR_MEM_STAT_* state */
	/*
	 * Optional error text, or NULL. Allocated with kmem and freed
	 * using strlen() + 1 as the size, so the allocation must be
	 * exactly that large.
	 */
	char		*string;
} dr_mem_res_t;

/*
 * Human readable text for each result code. dr_mem_get_errstr()
 * indexes this table directly with the result value, so the entries
 * must stay in the order of the DR_MEM_RES_* codes named alongside.
 */
static char *
dr_mem_estr[] = {
	"operation succeeded",		/* DR_MEM_RES_OK */
	"operation failed",		/* DR_MEM_RES_FAILURE */
	"operation was blocked",	/* DR_MEM_RES_BLOCKED */
	"memory not defined in MD",	/* DR_MEM_RES_NOT_IN_MD */
	"memory already in use",	/* DR_MEM_RES_ESPAN */
	"memory access test failed",	/* DR_MEM_RES_EFAULT */
	"resource not available",	/* DR_MEM_RES_ERESOURCE */
	"permanent pages in span",	/* DR_MEM_RES_PERM */
	"memory span busy",		/* DR_MEM_RES_EBUSY */
	"VM viability test failed",	/* DR_MEM_RES_ENOTVIABLE */
	"no pages to unconfigure",	/* DR_MEM_RES_ENOWORK */
	"operation cancelled",		/* DR_MEM_RES_ECANCELLED */
	"operation refused",		/* DR_MEM_RES_EREFUSED */
	"memory span duplicate",	/* DR_MEM_RES_EDUP */
	"invalid argument"		/* DR_MEM_RES_EINVAL */
};

/*
 * Additional detail text, indexed by sub-result code. When the
 * sub-result is not DR_MEM_SRES_NONE, dr_mem_get_errstr() appends the
 * entry to the main result text after a ": " separator.
 */
static char *
dr_mem_estr_detail[] = {
	"",					/* DR_MEM_SRES_NONE */
	"memory DR disabled after migration"	/* DR_MEM_SRES_OS_SUSPENDED */
};

/*
 * State shared between mem_del() and its completion callback
 * del_done(): mem_del() sleeps on 'cond' until the callback has set
 * 'done' and recorded the outcome in 'error'.
 */
typedef struct {
	kcondvar_t cond;	/* signalled by del_done() */
	kmutex_t lock;		/* protects error and done */
	int error;		/* error value passed to del_done() */
	int done;		/* non-zero once del_done() has run */
} mem_sync_t;

/*
 * Internal Functions
 */

/* DS capability registration and teardown */
static int dr_mem_init(void);
static int dr_mem_fini(void);

/* per-message-type request handlers called from dr_mem_data_handler() */
static int dr_mem_list_wrk(dr_mem_hdr_t *, dr_mem_hdr_t **, int *);
static int dr_mem_list_query(dr_mem_hdr_t *, dr_mem_hdr_t **, int *);
static int dr_mem_del_stat(dr_mem_hdr_t *, dr_mem_hdr_t **, int *);
static int dr_mem_del_cancel(dr_mem_hdr_t *, dr_mem_hdr_t **, int *);

/* operations on a single mblk */
static int dr_mem_unconfigure(dr_mem_blk_t *, int *);
static int dr_mem_configure(dr_mem_blk_t *, int *);
static void dr_mem_query(dr_mem_blk_t *, dr_mem_query_t *);

/* result array management and response packing */
static dr_mem_res_t *dr_mem_res_array_init(dr_mem_hdr_t *, drctl_rsrc_t *, int);
static void dr_mem_res_array_fini(dr_mem_res_t *res, int nres);
static size_t dr_mem_pack_response(dr_mem_hdr_t *req, dr_mem_res_t *res,
    dr_mem_hdr_t **respp);

/* machine description (MD) lookup of an mblk */
static int dr_mem_find(dr_mem_blk_t *mbp);
static mde_cookie_t dr_mem_find_node_md(dr_mem_blk_t *, md_t *, mde_cookie_t *);

/* add or delete a range of pages, given as base pfn and page count */
static int mem_add(pfn_t, pgcnt_t);
static int mem_del(pfn_t, pgcnt_t);

/* NOTE(review): declared locally; confirm no included header provides it */
extern int kphysm_add_memory_dynamic(pfn_t, pgcnt_t);

/*
 * Module load entry point.
 *
 * Fails with ENOTSUP when Memory DR is disabled. Otherwise runs
 * dr_mem_init() and installs the module; if the installation fails
 * the effect of dr_mem_init() is undone with dr_mem_fini().
 *
 * Returns 0 on success or the error from the failing step.
 */
int
_init(void)
{
	int	err;

	/* refuse to load when Memory DR is disabled */
	if (dr_is_disabled(DR_TYPE_MEM))
		return (ENOTSUP);

	err = dr_mem_init();
	if (err != 0) {
		cmn_err(CE_NOTE, "Memory DR initialization failed");
		return (err);
	}

	err = mod_install(&modlinkage);
	if (err == 0)
		return (0);

	/* installation failed: back out dr_mem_init() */
	(void) dr_mem_fini();

	return (err);
}

/*
 * Module information entry point; simply reports what mod_info()
 * returns for this module's linkage.
 */
int
_info(struct modinfo *modinfop)
{
	int	rv;

	rv = mod_info(&modlinkage, modinfop);

	return (rv);
}

/*
 * Module unload entry point.
 *
 * Unloading is refused with EBUSY unless dr_mem_allow_unload has been
 * set. When mod_remove() succeeds, dr_mem_fini() is called (its return
 * value is ignored) and 0 is returned; otherwise the mod_remove()
 * error is returned.
 */
int
_fini(void)
{
	int	err;

	if (!dr_mem_allow_unload)
		return (EBUSY);

	err = mod_remove(&modlinkage);
	if (err != 0)
		return (err);

	(void) dr_mem_fini();

	return (0);
}

/*
 * Register the memory DR capability and its callbacks with Domain
 * Services. Returns 0 on success, otherwise the ds_cap_init() error
 * (which is also logged).
 */
static int
dr_mem_init(void)
{
	int err = ds_cap_init(&dr_mem_cap, &dr_mem_ops);

	if (err == 0)
		return (0);

	cmn_err(CE_NOTE, "dr_mem: ds_cap_init failed: %d", err);

	return (err);
}

/*
 * Unregister the memory DR capability from Domain Services. Returns
 * the ds_cap_fini() result; a non-zero result is also logged.
 */
static int
dr_mem_fini(void)
{
	int err = ds_cap_fini(&dr_mem_cap);

	if (err != 0)
		cmn_err(CE_NOTE, "dr_mem: ds_cap_fini failed: %d", err);

	return (err);
}

/*
 * DS registration callback: remember the service handle so that
 * responses can later be sent with it.
 */
static void
dr_mem_reg_handler(ds_cb_arg_t arg, ds_ver_t *ver, ds_svc_hdl_t hdl)
{
	/* record the handle used by dr_mem_data_handler() for replies */
	ds_handle = hdl;

	DR_DBG_MEM("reg_handler: arg=0x%p, ver=%d.%d, hdl=0x%lx\n", arg,
	    ver->major, ver->minor, hdl);
}

/*
 * DS unregistration callback: the service handle is no longer usable,
 * so mark it invalid.
 */
static void
dr_mem_unreg_handler(ds_cb_arg_t arg)
{
	ds_handle = DS_INVALID_HDL;

	DR_DBG_MEM("unreg_handler: arg=0x%p\n", arg);
}

/*
 * DS data callback: validate an incoming memory DR request, dispatch
 * it to the handler for its message type and send the response back
 * over ds_handle.
 *
 * The handlers allocate the response on success and leave 'resp'
 * untouched on failure; in the latter case (and for any malformed
 * request) a header-only DR_MEM_ERROR response carrying the error
 * value is sent from the stack instead.
 *
 * Validation performed here:
 *  - the message must hold at least a complete header; the request
 *    number is echoed back only when that is the case, otherwise 0
 *    is used so that nothing is read past the end of a short buffer;
 *  - for requests that carry a list of mblks, the mblk count in
 *    msg_arg must be non-zero and the list must fit inside the
 *    received buffer, because the handlers index the list by msg_arg.
 */
/*ARGSUSED*/
static void
dr_mem_data_handler(ds_cb_arg_t arg, void *buf, size_t buflen)
{
	dr_mem_hdr_t	*req = buf;
	dr_mem_hdr_t	err_resp;
	dr_mem_hdr_t	*resp = &err_resp;
	int		resp_len = 0;
	int		rv = EINVAL;
	boolean_t	hdr_ok = B_FALSE;
	size_t		mblk_off;
	size_t		max_mblks;

	/*
	 * Sanity check the message
	 */
	if (buflen < sizeof (dr_mem_hdr_t)) {
		DR_DBG_MEM("incoming message short: expected at least %ld "
		    "bytes, received %ld\n", sizeof (dr_mem_hdr_t), buflen);
		goto done;
	}

	if (req == NULL) {
		DR_DBG_MEM("empty message: expected at least %ld bytes\n",
		    sizeof (dr_mem_hdr_t));
		goto done;
	}

	/* a complete header is present and may be read from now on */
	hdr_ok = B_TRUE;

	/* number of whole mblk entries the received buffer can hold */
	mblk_off = (uintptr_t)DR_MEM_CMD_MBLKS(req) - (uintptr_t)req;
	max_mblks = (buflen > mblk_off) ?
	    (buflen - mblk_off) / sizeof (dr_mem_blk_t) : 0;

	DR_DBG_MEM("incoming request:\n");
	DR_DBG_DUMP_MSG(buf, buflen);

	/*
	 * Process the command
	 */
	switch (req->msg_type) {
	case DR_MEM_CONFIGURE:
	case DR_MEM_UNCONFIGURE:
		if (req->msg_arg == 0) {
			DR_DBG_MEM("No mblks specified for operation\n");
			goto done;
		}
		if (req->msg_arg > max_mblks) {
			DR_DBG_MEM("message too short for %d mblks\n",
			    req->msg_arg);
			goto done;
		}
		if ((rv = dr_mem_list_wrk(req, &resp, &resp_len)) != 0) {
			DR_DBG_MEM("%s failed (%d)\n",
			    (req->msg_type == DR_MEM_CONFIGURE) ?
			    "Memory configure" : "Memory unconfigure", rv);
		}
		break;

	case DR_MEM_UNCONF_STATUS:
		if ((rv = dr_mem_del_stat(req, &resp, &resp_len)) != 0)
			DR_DBG_MEM("Memory delete status failed (%d)\n", rv);
		break;

	case DR_MEM_UNCONF_CANCEL:
		if ((rv = dr_mem_del_cancel(req, &resp, &resp_len)) != 0)
			DR_DBG_MEM("Memory delete cancel failed (%d)\n", rv);
		break;

	case DR_MEM_QUERY:
		if (req->msg_arg == 0) {
			DR_DBG_MEM("No mblks specified for operation\n");
			goto done;
		}
		if (req->msg_arg > max_mblks) {
			DR_DBG_MEM("message too short for %d mblks\n",
			    req->msg_arg);
			goto done;
		}
		if ((rv = dr_mem_list_query(req, &resp, &resp_len)) != 0)
			DR_DBG_MEM("Memory query failed (%d)\n", rv);
		break;

	default:
		/* rv is still EINVAL, which is reported to the sender */
		cmn_err(CE_NOTE, "unsupported memory DR operation (%d)",
		    req->msg_type);
		break;
	}

done:
	/* check if an error occurred */
	if (resp == &err_resp) {
		resp->req_num = hdr_ok ? req->req_num : 0;
		resp->msg_type = DR_MEM_ERROR;
		resp->msg_arg = rv;
		resp_len = sizeof (dr_mem_hdr_t);
	}

	DR_DBG_MEM("outgoing response:\n");
	DR_DBG_DUMP_MSG(resp, resp_len);

	/* send back the response */
	if (ds_cap_send(ds_handle, resp, resp_len) != 0) {
		DR_DBG_MEM("ds_send failed\n");
	}

	/* free any allocated memory */
	if (resp != &err_resp) {
		kmem_free(resp, resp_len);
	}
}

/*
 * Build the error text for a result/sub-result pair.
 *
 * With no sub-result the text is a copy of the dr_mem_estr[] entry;
 * otherwise it is that entry followed by ": " and the matching
 * dr_mem_estr_detail[] entry.
 *
 * The returned buffer is kmem-allocated with a size of exactly
 * strlen() + 1 bytes, which is what dr_mem_res_array_fini() relies on
 * when freeing it. 'result' and 'subresult' are used as table indices
 * without range checking.
 */
static char *
dr_mem_get_errstr(int result, int subresult)
{
	char		*base = dr_mem_estr[result];
	char		*detail;
	char		*buf;
	size_t		buflen;

	if (subresult == DR_MEM_SRES_NONE)
		return (i_ddi_strdup(base, KM_SLEEP));

	detail = dr_mem_estr_detail[subresult];

	/* room for both parts, the separator and the terminating NUL */
	buflen = strlen(base) + strlen(": ") + strlen(detail) + 1;
	buf = kmem_alloc(buflen, KM_SLEEP);

	(void) snprintf(buf, buflen, "%s%s%s", base, ": ", detail);

	return (buf);
}

/*
 * Common routine to config or unconfig multiple mblks.
 *
 * The request is first submitted to drctl_config_init(); every mblk
 * the drctl layer allowed is then passed to dr_mem_configure() or
 * dr_mem_unconfigure(), the outcome is reported to
 * drctl_config_fini(), and finally a response message is packed and
 * handed back through 'resp'/'resp_len' (to be freed by the caller).
 *
 * If the suspend subsystem does not currently permit memory DR, no
 * mblk is operated on: each one that drctl allowed is reported as
 * DR_MEM_RES_BLOCKED with the state it had before the request
 * (unconfigured for a configure request, configured for an
 * unconfigure request).
 *
 * Returns 0 when a response was built, the drctl_config_init() error
 * if that call failed, or -1 for an unexpected message type.
 *
 * Note: Do not modify result buffer or length on error.
 */
static int
dr_mem_list_wrk(dr_mem_hdr_t *req, dr_mem_hdr_t **resp, int *resp_len)
{
	int		rv;
	int		idx;
	int		count;
	int		result;
	int		subresult;
	int		status;
	int		blocked_status;
	boolean_t	suspend_allows_dr;
	fn_t		dr_fn;
	int		se_hint;
	dr_mem_blk_t	*req_mblks;
	dr_mem_res_t	*res;
	int		drctl_cmd;
	int		drctl_flags = 0;
	drctl_rsrc_t	*drctl_req;
	size_t		drctl_req_len;
	drctl_resp_t	*drctl_resp = NULL;
	drctl_rsrc_t	*drctl_rsrc;
	size_t		drctl_resp_len = 0;
	drctl_cookie_t	drctl_res_ck;

	ASSERT((req != NULL) && (req->msg_arg != 0));

	count = req->msg_arg;

	/*
	 * Extract all information that is specific
	 * to the various types of operations.
	 */
	switch (req->msg_type) {
	case DR_MEM_CONFIGURE:
		dr_fn = dr_mem_configure;
		drctl_cmd = DRCTL_MEM_CONFIG_REQUEST;
		se_hint = SE_HINT_INSERT;
		blocked_status = DR_MEM_STAT_UNCONFIGURED;
		break;
	case DR_MEM_UNCONFIGURE:
		dr_fn = dr_mem_unconfigure;
		drctl_cmd = DRCTL_MEM_UNCONFIG_REQUEST;
		se_hint = SE_HINT_REMOVE;
		blocked_status = DR_MEM_STAT_CONFIGURED;
		break;
	default:
		/* Programming error if we reach this. */
		cmn_err(CE_NOTE, "%s: bad msg_type %d\n",
		    __func__, req->msg_type);
		ASSERT(0);
		return (-1);
	}

	/* the incoming array of mblks to operate on */
	req_mblks = DR_MEM_CMD_MBLKS(req);

	/* allocate drctl request msg based on incoming resource count */
	drctl_req_len = sizeof (drctl_rsrc_t) * count;
	drctl_req = kmem_zalloc(drctl_req_len, KM_SLEEP);

	/* copy the addr and size of each mblk into the drctl request */
	for (idx = 0; idx < count; idx++) {
		drctl_req[idx].res_mem_addr = req_mblks[idx].addr;
		drctl_req[idx].res_mem_size = req_mblks[idx].size;
	}

	rv = drctl_config_init(drctl_cmd, drctl_flags, drctl_req,
	    count, &drctl_resp, &drctl_resp_len, &drctl_res_ck);

	ASSERT((drctl_resp != NULL) && (drctl_resp_len != 0));

	if (rv != 0) {
		DR_DBG_MEM("%s: drctl_config_init returned: %d\n",
		    __func__, rv);
		/* only free a response that was actually handed back */
		if (drctl_resp != NULL)
			kmem_free(drctl_resp, drctl_resp_len);
		kmem_free(drctl_req, drctl_req_len);
		return (rv);
	}

	ASSERT(drctl_resp->resp_type == DRCTL_RESP_OK);

	drctl_rsrc = drctl_resp->resp_resources;

	/* create the result scratch array */
	res = dr_mem_res_array_init(req, drctl_rsrc, count);

	/*
	 * Memory DR operations are not safe if we have been suspended and
	 * resumed. Until this limitation is lifted, check to see if memory
	 * DR operations are permitted at this time by the suspend subsystem.
	 */
	suspend_allows_dr = suspend_memdr_allowed();
	subresult = (suspend_allows_dr == B_FALSE) ?
	    DR_MEM_SRES_OS_SUSPENDED : DR_MEM_SRES_NONE;

	/* perform the specified operation on each of the mblks */
	for (idx = 0; idx < count; idx++) {
		/*
		 * If no action will be taken against the current
		 * mblk, update the drctl resource information to
		 * ensure that it gets recovered properly during
		 * the drctl fini() call.
		 */
		if (res[idx].result != DR_MEM_RES_OK) {
			drctl_req[idx].status = DRCTL_STATUS_CONFIG_FAILURE;
			continue;
		}

		/*
		 * If memory DR operations are permitted at this time by
		 * the suspend subsystem, call the function to perform the
		 * operation, otherwise return a result indicating that the
		 * operation was blocked and that the mblk is still in the
		 * state it had before the request.
		 */
		if (suspend_allows_dr) {
			result = (*dr_fn)(&req_mblks[idx], &status);
		} else {
			result = DR_MEM_RES_BLOCKED;
			status = blocked_status;
		}

		/* save off results of the operation */
		res[idx].result = result;
		res[idx].status = status;
		res[idx].addr = req_mblks[idx].addr;	/* for partial case */
		res[idx].size = req_mblks[idx].size;	/* for partial case */
		res[idx].string = dr_mem_get_errstr(result, subresult);

		/* save result for drctl fini() reusing init() msg memory */
		drctl_req[idx].status = (result != DR_MEM_RES_OK) ?
		    DRCTL_STATUS_CONFIG_FAILURE : DRCTL_STATUS_CONFIG_SUCCESS;

		DR_DBG_MEM("%s: mblk 0x%lx.0x%lx stat %d result %d off '%s'\n",
		    __func__, req_mblks[idx].addr, req_mblks[idx].size,
		    drctl_req[idx].status, result,
		    (res[idx].string) ? res[idx].string : "");
	}

	if ((rv = drctl_config_fini(&drctl_res_ck, drctl_req, count)) != 0)
		DR_DBG_MEM("%s: drctl_config_fini returned: %d\n",
		    __func__, rv);

	/*
	 * Operation completed without any fatal errors.
	 * Pack the response for transmission.
	 */
	*resp_len = dr_mem_pack_response(req, res, resp);

	/* notify interested parties about the operation */
	dr_generate_event(DR_TYPE_MEM, se_hint);

	/*
	 * Deallocate any scratch memory.
	 */
	kmem_free(drctl_resp, drctl_resp_len);
	kmem_free(drctl_req, drctl_req_len);

	dr_mem_res_array_fini(res, count);

	return (0);
}

/*
 * Allocate and initialize a result array based on the initial
 * drctl operation. A valid result array is always returned.
 *
 * Each entry starts out with the resource's address and size and a
 * result of DR_MEM_RES_OK. Resources that the drctl layer did not
 * allow are marked DR_MEM_RES_BLOCKED, given the state they had
 * before the request, and receive a private copy of the drctl error
 * string when one was supplied.
 *
 * The array must be released with dr_mem_res_array_fini().
 */
static dr_mem_res_t *
dr_mem_res_array_init(dr_mem_hdr_t *req, drctl_rsrc_t *rsrc, int nrsrc)
{
	int		idx;
	dr_mem_res_t	*res;
	char		*err_str;
	size_t		err_len;

	/* allocate zero filled buffer to initialize fields */
	res = kmem_zalloc(nrsrc * sizeof (dr_mem_res_t), KM_SLEEP);

	/*
	 * Fill in the result information for each resource.
	 */
	for (idx = 0; idx < nrsrc; idx++) {
		res[idx].addr = rsrc[idx].res_mem_addr;
		res[idx].size = rsrc[idx].res_mem_size;
		res[idx].result = DR_MEM_RES_OK;

		if (rsrc[idx].status == DRCTL_STATUS_ALLOW)
			continue;

		/*
		 * Update the state information for this mblk.
		 */
		res[idx].result = DR_MEM_RES_BLOCKED;
		res[idx].status = (req->msg_type == DR_MEM_CONFIGURE) ?
		    DR_MEM_STAT_UNCONFIGURED : DR_MEM_STAT_CONFIGURED;

		/*
		 * If an error string exists, copy it out of the
		 * message buffer. This eliminates any dependency
		 * on the memory allocated for the message buffer
		 * itself.
		 *
		 * The offset is a byte count relative to the start of
		 * the resource array; zero means no string is present.
		 */
		if (rsrc[idx].offset != 0) {
			err_str = (char *)rsrc + rsrc[idx].offset;
			err_len = strlen(err_str) + 1;

			res[idx].string = kmem_alloc(err_len, KM_SLEEP);
			bcopy(err_str, res[idx].string, err_len);
		}
	}

	return (res);
}

/*
 * Release a result array of 'nres' entries together with every error
 * string attached to it. Each string is freed with a size of
 * strlen() + 1, so it must have been allocated with exactly that size.
 */
static void
dr_mem_res_array_fini(dr_mem_res_t *res, int nres)
{
	int		i;
	dr_mem_res_t	*entry;

	for (i = 0; i < nres; i++) {
		entry = &res[i];

		/* nothing to release for entries without an error string */
		if (entry->string == NULL)
			continue;

		kmem_free(entry->string, strlen(entry->string) + 1);
	}

	/* finally release the array itself */
	kmem_free(res, sizeof (dr_mem_res_t) * nres);
}

/*
 * Build the response message for a configure/unconfigure request from
 * the result array. A valid response message and valid size
 * information is always returned.
 *
 * The message consists of the header, one stat entry per requested
 * mblk (req->msg_arg of them), and then the error strings packed back
 * to back. A stat entry whose result has a string records that
 * string's byte offset from the start of the message.
 *
 * The message is returned through 'respp'; its size in bytes is the
 * return value.
 */
static size_t
dr_mem_pack_response(dr_mem_hdr_t *req, dr_mem_res_t *res, dr_mem_hdr_t **respp)
{
	int		nstat = req->msg_arg;
	int		i;
	size_t		stats_sz = sizeof (dr_mem_stat_t) * nstat;
	size_t		total_sz = sizeof (dr_mem_hdr_t) + stats_sz;
	size_t		slen;
	uint32_t	off;
	caddr_t		dst;
	dr_mem_hdr_t	*msg;
	dr_mem_stat_t	*stats;

	/* account for every error string, including its terminator */
	for (i = 0; i < nstat; i++) {
		if (res[i].string != NULL)
			total_sz += strlen(res[i].string) + 1;
	}

	msg = kmem_zalloc(total_sz, KM_SLEEP);

	/* header */
	msg->req_num = req->req_num;
	msg->msg_type = DR_MEM_OK;
	msg->msg_arg = nstat;

	/* the strings are laid out directly behind the stat array */
	stats = DR_MEM_RESP_STATS(msg);
	off = sizeof (dr_mem_hdr_t) + stats_sz;
	dst = (char *)stats + stats_sz;

	for (i = 0; i < nstat; i++) {
		stats[i].addr = res[i].addr;
		stats[i].size = res[i].size;
		stats[i].result = res[i].result;
		stats[i].status = res[i].status;

		if (res[i].string == NULL)
			continue;

		/* append the string and point the stat entry at it */
		slen = strlen(res[i].string) + 1;
		bcopy(res[i].string, dst, slen);
		stats[i].string_off = off;

		off += slen;
		dst += slen;
	}

	/* every byte of the buffer should now be accounted for */
	ASSERT(off == total_sz);

	*respp = msg;

	return (total_sz);
}

/*
 * Fill in the query result for a single mblk.
 *
 * The span is examined with kphysm_del_span_query() and the page
 * counts and page frame numbers it reports are converted to bytes.
 * If the query fails or reports no physical pages, '*mqp' is left
 * untouched; callers pass a zero-filled entry, so such an mblk is
 * reported as all zeroes.
 */
static void
dr_mem_query(dr_mem_blk_t *mbp, dr_mem_query_t *mqp)
{
	memquery_t mq;

	DR_DBG_MEM("dr_mem_query...\n");

	/* never act on stack garbage should the query not fill this in */
	bzero(&mq, sizeof (mq));

	if (kphysm_del_span_query(btop(mbp->addr), btop(mbp->size), &mq) !=
	    KPHYSM_OK)
		return;

	if (!mq.phys_pages)
		return;

	mqp->addr = mbp->addr;
	mqp->mq.phys_pages = ptob(mq.phys_pages);
	mqp->mq.managed = ptob(mq.managed);
	mqp->mq.nonrelocatable = ptob(mq.nonrelocatable);
	mqp->mq.first_nonrelocatable = ptob(mq.first_nonrelocatable);
	mqp->mq.last_nonrelocatable = ptob(mq.last_nonrelocatable);
	/*
	 * Set to the max byte offset within the page.
	 */
	if (mqp->mq.nonrelocatable)
		mqp->mq.last_nonrelocatable += PAGESIZE - 1;
}

/*
 * Handle a memory query request.
 *
 * When the first mblk of the request has both address and size zero,
 * the request is for the domain's full view of its memory and one
 * entry is returned for every span of a private copy of phys_install.
 * Otherwise one entry is returned for each of the req->msg_arg mblks
 * in the request.
 *
 * drctl is blocked for the duration of the query. The response is
 * allocated here and returned through 'resp'/'resp_len'; it is freed
 * by the caller. Always returns 0.
 *
 * Do not modify result buffer or length on error.
 */
static int
dr_mem_list_query(dr_mem_hdr_t *req, dr_mem_hdr_t **resp, int *resp_len)
{
	int		idx;
	int		rlen;
	int		nstat;
	boolean_t	full_view;
	struct memlist	*ml;
	struct memlist	*phys_copy = NULL;
	dr_mem_blk_t	*req_mblks, mb;
	dr_mem_hdr_t	*rp;
	dr_mem_query_t	*stat;

	drctl_block();

	/* the incoming array of req_mblks to query */
	req_mblks = DR_MEM_CMD_MBLKS(req);

	/*
	 * Decide once which kind of request this is, so that the size of
	 * the response and the loop filling it in can never disagree.
	 */
	full_view = (req_mblks->addr == 0 && req_mblks->size == 0) ?
	    B_TRUE : B_FALSE;

	if (full_view) {
		/*
		 * Request is for domain's full view of its memory.
		 * Place a copy in phys_copy then release the memlist lock.
		 */
		memlist_read_lock();
		phys_copy = dr_memlist_dup(phys_install);
		memlist_read_unlock();

		nstat = 0;
		for (ml = phys_copy; ml; ml = ml->ml_next)
			nstat++;
	} else {
		nstat = req->msg_arg;
	}

	/* allocate a response message, should be freed by caller */
	rlen = sizeof (dr_mem_hdr_t) + nstat * sizeof (dr_mem_query_t);
	rp = kmem_zalloc(rlen, KM_SLEEP);

	/* fill in the known data */
	rp->req_num = req->req_num;
	rp->msg_type = DR_MEM_OK;
	rp->msg_arg = nstat;

	/* stat array for the response */
	stat = DR_MEM_RESP_QUERY(rp);

	/* get the status for each of the mblocks */
	if (full_view) {
		for (idx = 0, ml = phys_copy; ml; ml = ml->ml_next, idx++) {
			mb.addr = ml->ml_address;
			mb.size = ml->ml_size;
			dr_mem_query(&mb, &stat[idx]);
		}
	} else {
		for (idx = 0; idx < nstat; idx++)
			dr_mem_query(&req_mblks[idx], &stat[idx]);
	}

	*resp = rp;
	*resp_len = rlen;
	if (phys_copy != NULL) {
		dr_memlist_delete(phys_copy);
	}
	drctl_unblock();

	return (0);
}

/*
 * Translate a KPHYSM_* error code into the DR_MEM_RES_* result code
 * reported to the requester. Codes without a dedicated result,
 * including any unknown value, map to DR_MEM_RES_FAILURE.
 */
static int
cvt_err(int err)
{
	switch (err) {
	case KPHYSM_OK:
		return (DR_MEM_RES_OK);
	case KPHYSM_ESPAN:
		return (DR_MEM_RES_ESPAN);
	case KPHYSM_EFAULT:
		return (DR_MEM_RES_EFAULT);
	case KPHYSM_ERESOURCE:
		return (DR_MEM_RES_ERESOURCE);
	case KPHYSM_ENONRELOC:
		return (DR_MEM_RES_PERM);
	case KPHYSM_EBUSY:
		return (DR_MEM_RES_EBUSY);
	case KPHYSM_ENOTVIABLE:
		return (DR_MEM_RES_ENOTVIABLE);
	case KPHYSM_ENOWORK:
		return (DR_MEM_RES_ENOWORK);
	case KPHYSM_ECANCELLED:
		return (DR_MEM_RES_ECANCELLED);
	case KPHYSM_EREFUSED:
		return (DR_MEM_RES_EREFUSED);
	case KPHYSM_EDUP:
		return (DR_MEM_RES_EDUP);

	/* errors that are only reported as a generic failure */
	case KPHYSM_ENOTSUP:
	case KPHYSM_ENOHANDLES:
	case KPHYSM_EHANDLE:
	case KPHYSM_ESEQUENCE:
	case KPHYSM_ENOTFINISHED:
	case KPHYSM_ENOTRUNNING:
	default:
		return (DR_MEM_RES_FAILURE);
	}
}

/*
 * Configure (add) a single mblk.
 *
 * The mblk must satisfy the alignment restrictions and must be found
 * by dr_mem_find() before its pages are added with mem_add().
 *
 * Returns a DR_MEM_RES_* code and stores the resulting state of the
 * mblk in '*status':
 *   misaligned mblk           DR_MEM_RES_EINVAL    / unconfigured
 *   dr_mem_find() == EINVAL   DR_MEM_RES_NOT_IN_MD / not present
 *   other dr_mem_find() error DR_MEM_RES_FAILURE   / unconfigured
 *   mem_add() failure         its result           / unconfigured
 *   success                   DR_MEM_RES_OK        / configured
 */
static int
dr_mem_configure(dr_mem_blk_t *mbp, int *status)
{
	uint64_t	addr = mbp->addr;
	uint64_t	size = mbp->size;
	int		err;

	DR_DBG_MEM("dr_mem_configure...\n");

	if (!MBLK_IS_VALID(mbp)) {
		DR_DBG_MEM("invalid mblk 0x%lx.0x%lx\n", addr, size);
		*status = DR_MEM_STAT_UNCONFIGURED;
		return (DR_MEM_RES_EINVAL);
	}

	err = dr_mem_find(mbp);
	if (err != 0) {
		DR_DBG_MEM("failed to find mblk 0x%lx.0x%lx (%d)\n",
		    addr, size, err);

		if (err == EINVAL) {
			*status = DR_MEM_STAT_NOT_PRESENT;
			return (DR_MEM_RES_NOT_IN_MD);
		}

		*status = DR_MEM_STAT_UNCONFIGURED;
		return (DR_MEM_RES_FAILURE);
	}

	err = mem_add(btop(addr), btop(size));
	DR_DBG_MEM("addr=0x%lx size=0x%lx rv=%d\n", addr, size, err);

	*status = (err != 0) ?
	    DR_MEM_STAT_UNCONFIGURED : DR_MEM_STAT_CONFIGURED;

	return (err);
}

/*
 * Unconfigure (delete) a single mblk.
 *
 * Returns DR_MEM_RES_EINVAL for a misaligned mblk, the mem_del()
 * result if the delete fails, and DR_MEM_RES_OK on success. '*status'
 * is set to configured on any failure and to unconfigured on success.
 */
static int
dr_mem_unconfigure(dr_mem_blk_t *mbp, int *status)
{
	int err;

	DR_DBG_MEM("dr_mem_unconfigure...\n");

	if (!MBLK_IS_VALID(mbp)) {
		DR_DBG_MEM("invalid mblk 0x%lx.0x%lx\n",
		    mbp->addr, mbp->size);
		*status = DR_MEM_STAT_CONFIGURED;
		return (DR_MEM_RES_EINVAL);
	}

	err = mem_del(btop(mbp->addr), btop(mbp->size));
	if (err != 0) {
		/* the delete failed, so the mblk is still configured */
		*status = DR_MEM_STAT_CONFIGURED;
		return (err);
	}

	*status = DR_MEM_STAT_UNCONFIGURED;
	DR_DBG_MEM("mblk 0x%lx.0x%lx unconfigured\n",
	    mbp->addr, mbp->size);

	return (DR_MEM_RES_OK);
}

/*
 * Handle a delete status request.
 *
 * If a memory delete is in progress (dr_mh is set) and its status can
 * be obtained, the response carries one memdelstat_t with the page
 * counts converted to bytes and msg_arg is 1. Otherwise the response
 * is just the header with msg_arg 0.
 *
 * The response is allocated here and freed by the caller. Always
 * returns 0.
 */
static int
dr_mem_del_stat(dr_mem_hdr_t *req, dr_mem_hdr_t **resp, int *resp_len)
{
	int			have_stat = 0;
	int			len = sizeof (dr_mem_hdr_t);
	memdelstat_t		cur;
	memdelstat_t		*out;
	dr_mem_hdr_t		*msg;

	/*
	 * If a mem delete is in progress, get its status.
	 */
	if (dr_mh && kphysm_del_status(dr_mh, &cur) == KPHYSM_OK)
		have_stat = 1;

	/* the stat structure is only appended when there is one */
	if (have_stat)
		len += sizeof (memdelstat_t);

	msg = kmem_zalloc(len, KM_SLEEP);

	msg->req_num = req->req_num;
	msg->msg_type = DR_MEM_OK;
	msg->msg_arg = have_stat;

	if (have_stat) {
		out = DR_MEM_RESP_DEL_STAT(msg);
		out->phys_pages = ptob(cur.phys_pages);
		out->managed = ptob(cur.managed);
		out->collected = ptob(cur.collected);
	}

	*resp = msg;
	*resp_len = len;

	return (0);
}

/*
 * Handle a delete cancel request.
 *
 * If a memory delete is in progress (dr_mh is set) it is cancelled.
 * The header-only response reports DR_MEM_RES_EINVAL in msg_arg when
 * that cancel fails and DR_MEM_RES_OK otherwise, including when no
 * delete was in progress.
 *
 * The response is allocated here and freed by the caller. Always
 * returns 0.
 */
static int
dr_mem_del_cancel(dr_mem_hdr_t *req, dr_mem_hdr_t **resp, int *resp_len)
{
	int		len = sizeof (dr_mem_hdr_t);
	dr_mem_hdr_t	*msg;

	msg = kmem_zalloc(len, KM_SLEEP);

	msg->req_num = req->req_num;
	msg->msg_type = DR_MEM_OK;

	if (dr_mh && kphysm_del_cancel(dr_mh) != KPHYSM_OK)
		msg->msg_arg = DR_MEM_RES_EINVAL;
	else
		msg->msg_arg = DR_MEM_RES_OK;

	*resp = msg;
	*resp_len = len;

	return (0);
}

/*
 * Check whether an mblk is described by the machine description (MD).
 *
 * Returns 0 if it is, EINVAL if it is not, and -1 if no MD handle
 * could be obtained.
 */
static int
dr_mem_find(dr_mem_blk_t *mbp)
{
	md_t		*mdp;
	mde_cookie_t	*scratch;
	int		scratch_sz;
	int		nnodes;
	int		missing;

	mdp = md_get_handle();
	if (mdp == NULL) {
		DR_DBG_MEM("unable to initialize machine description\n");
		return (-1);
	}

	nnodes = md_node_count(mdp);
	ASSERT(nnodes > 0);

	/* scratch list big enough to hold every node in the MD */
	scratch_sz = nnodes * sizeof (mde_cookie_t);
	scratch = kmem_zalloc(scratch_sz, KM_SLEEP);

	missing = (dr_mem_find_node_md(mbp, mdp, scratch) ==
	    MDE_INVAL_ELEM_COOKIE);

	DR_DBG_MEM("mblk 0x%lx.0x%lx %s\n", mbp->addr, mbp->size,
	    missing ? "not found" : "found");

	kmem_free(scratch, scratch_sz);
	(void) md_fini_handle(mdp);

	return (missing ? EINVAL : 0);
}

/*
 * Locate the MD "mblock" node that contains a given mblk.
 *
 * A node matches when the range described by its "base" and "size"
 * properties fully covers the mblk. The cookie of the first matching
 * node is returned; MDE_INVAL_ELEM_COOKIE is returned when the scan
 * fails, when no node matches, or when a node lacking one of the two
 * properties is met before a match (the search stops at that node).
 *
 * 'listp' is caller-provided scratch space and must be able to hold
 * a cookie for every node in the MD.
 */
static mde_cookie_t
dr_mem_find_node_md(dr_mem_blk_t *mbp, md_t *mdp, mde_cookie_t *listp)
{
	int		i;
	int		nfound;
	uint64_t	base;
	uint64_t	size;
	mde_cookie_t	root;
	mde_cookie_t	match = MDE_INVAL_ELEM_COOKIE;

	root = md_root_node(mdp);
	ASSERT(root != MDE_INVAL_ELEM_COOKIE);

	/* collect every mblock node reachable from the root */
	nfound = md_scan_dag(mdp, root, md_find_name(mdp, "mblock"),
	    md_find_name(mdp, "fwd"), listp);

	if (nfound < 0) {
		DR_DBG_MEM("Scan for mblks failed\n");
		return (match);
	}

	DR_DBG_MEM("dr_mem_find_node_md: found %d mblks in the MD\n", nfound);

	for (i = 0; i < nfound; i++) {
		if (md_get_prop_val(mdp, listp[i], "base", &base) != 0) {
			DR_DBG_MEM("Missing 'base' property for mblk node %d\n",
			    i);
			break;
		}

		if (md_get_prop_val(mdp, listp[i], "size", &size) != 0) {
			DR_DBG_MEM("Missing 'size' property for mblk node %d\n",
			    i);
			break;
		}

		/* skip nodes that do not cover the whole mblk */
		if (base > mbp->addr)
			continue;
		if ((base + size) < (mbp->addr + mbp->size))
			continue;

		DR_DBG_MEM("dr_mem_find_node_md: found mblk "
		    "0x%lx.0x%lx in MD\n", mbp->addr, mbp->size);
		match = listp[i];
		break;
	}

	if (match == MDE_INVAL_ELEM_COOKIE) {
		DR_DBG_MEM("mblk 0x%lx.0x%lx not in MD\n",
		    mbp->addr, mbp->size);
	}

	return (match);
}

/*
 * Add 'npgs' pages starting at page frame 'base' to the system.
 *
 * After a successful add the range is also handed to
 * kcage_range_add(); a failure there is only logged. Returns the
 * DR_MEM_RES_* equivalent of the kphysm_add_memory_dynamic() result,
 * or DR_MEM_RES_OK straight away for an empty range.
 */
static int
mem_add(pfn_t base, pgcnt_t npgs)
{
	int add_err;
	int cage_err;

	DR_DBG_MEM("%s: begin base=0x%lx npgs=0x%lx\n", __func__, base, npgs);

	if (npgs == 0)
		return (DR_MEM_RES_OK);

	add_err = kphysm_add_memory_dynamic(base, npgs);
	DR_DBG_MEM("%s: kphysm_add(0x%lx, 0x%lx) = %d", __func__, base, npgs,
	    add_err);

	if (add_err == KPHYSM_OK) {
		cage_err = kcage_range_add(base, npgs, KCAGE_DOWN);
		if (cage_err != 0)
			cmn_err(CE_WARN, "kcage_range_add() = %d", cage_err);
	}

	return (cvt_err(add_err));
}

static void
del_done(void *arg, int error)
{
	mem_sync_t *ms = arg;

	mutex_enter(&ms->lock);
	ms->error = error;
	ms->done = 1;
	cv_signal(&ms->cond);
	mutex_exit(&ms->lock);
}

/*
 * Delete 'npgs' pages starting at page frame 'base' from the system.
 *
 * Steps, each of which aborts the operation on failure:
 *  1. obtain a delete handle;
 *  2. query the span and refuse it if it holds non-relocatable pages;
 *  3. remove the range from the kernel cage;
 *  4. attach the span to the delete handle;
 *  5. record the span in a private memlist;
 *  6. start the delete and wait for del_done() to report completion.
 *     If the wait is interrupted by a signal the delete is cancelled,
 *     and completion is then awaited uninterruptibly.
 *
 * While the delete is running, dr_mh holds the handle so that status
 * and cancel requests can reach it.
 *
 * Whenever the operation fails after step 3 succeeded, the range is
 * given back to the cage. This also covers failures in steps 4 and 5,
 * where the private memlist is still empty, by falling back to the
 * base/npgs arguments.
 *
 * Returns a DR_MEM_RES_* code; DR_MEM_RES_OK for an empty range.
 */
static int
mem_del(pfn_t base, pgcnt_t npgs)
{
	int rv, err, del_range = 0;
	int convert = 1;	/* rv still holds a KPHYSM_* code */
	mem_sync_t ms;
	memquery_t mq;
	memhandle_t mh;
	struct memlist *ml;
	struct memlist *d_ml = NULL;

	DR_DBG_MEM("%s: begin base=0x%lx npgs=0x%lx\n", __func__, base, npgs);

	if (npgs == 0)
		return (DR_MEM_RES_OK);

	if ((rv = kphysm_del_gethandle(&mh)) != KPHYSM_OK) {
		cmn_err(CE_WARN, "%s: del_gethandle() = %d", __func__, rv);
		rv = cvt_err(rv);
		return (rv);
	}
	if ((rv = kphysm_del_span_query(base, npgs, &mq))
	    != KPHYSM_OK) {
		cmn_err(CE_WARN, "%s: del_span_query() = %d", __func__, rv);
		goto done;
	}
	if (mq.nonrelocatable) {
		DR_DBG_MEM("%s: non-reloc pages = %ld",
		    __func__, mq.nonrelocatable);
		rv  = KPHYSM_ENONRELOC;
		goto done;
	}
	if ((rv = kcage_range_delete(base, npgs)) != 0) {
		/* an errno value is returned here, not a KPHYSM_* code */
		switch (rv) {
		case EBUSY:
			rv = DR_MEM_RES_ENOTVIABLE;
			break;
		default:
			rv = DR_MEM_RES_FAILURE;
			break;
		}
		convert = 0; /* conversion done */
		cmn_err(CE_WARN, "%s: del_range() = %d", __func__, rv);
		goto done;
	} else {
		/* from here on a failure must give the range back */
		del_range++;
	}
	if ((rv = kphysm_del_span(mh, base, npgs)) != KPHYSM_OK) {
		cmn_err(CE_WARN, "%s: del_span() = %d", __func__, rv);
		goto done;
	}
	if ((rv = memlist_add_span(ptob(base), ptob(npgs), &d_ml))
	    != MEML_SPANOP_OK) {
		switch (rv) {
		case MEML_SPANOP_ESPAN:
			rv = DR_MEM_RES_ESPAN;
			break;
		case MEML_SPANOP_EALLOC:
			rv = DR_MEM_RES_ERESOURCE;
			break;
		default:
			rv = DR_MEM_RES_FAILURE;
			break;
		}
		convert = 0; /* conversion done */
		cmn_err(CE_WARN, "%s: add_span() = %d", __func__, rv);
		goto done;
	}

	DR_DBG_MEM("%s: reserved=0x%lx", __func__, npgs);

	bzero((void *) &ms, sizeof (ms));

	mutex_init(&ms.lock, NULL, MUTEX_DRIVER, NULL);
	cv_init(&ms.cond, NULL, CV_DRIVER, NULL);

	/*
	 * Hold the lock across the start of the delete so that del_done()
	 * cannot signal completion before we are waiting for it.
	 */
	mutex_enter(&ms.lock);

	if ((rv = kphysm_del_start(mh, del_done, (void *) &ms)) == KPHYSM_OK) {
		/*
		 * Since we've called drctl_config_init, we are the only
		 * DR ctl operation in progress.  Set dr_mh to the
		 * delete memhandle for use by stat and cancel.
		 */
		ASSERT(dr_mh == NULL);
		dr_mh = mh;

		/*
		 * Wait for completion or interrupt.
		 */
		while (!ms.done) {
			if (cv_wait_sig(&ms.cond, &ms.lock) == 0) {
				/*
				 * There is a pending signal.
				 */
				(void) kphysm_del_cancel(mh);
				DR_DBG_MEM("%s: cancel", __func__);
				/*
				 * Wait for completion.
				 */
				while (!ms.done)
					cv_wait(&ms.cond, &ms.lock);
			}
		}
		dr_mh = NULL;
		rv = ms.error;
	} else {
		DR_DBG_MEM("%s: del_start() = %d", __func__, rv);
	}

	mutex_exit(&ms.lock);
	cv_destroy(&ms.cond);
	mutex_destroy(&ms.lock);

done:
	if (rv && del_range) {
		/*
		 * Add back the spans to the kcage growth list. If the
		 * failure happened before the span was recorded in d_ml,
		 * restore the range straight from the arguments so that
		 * it is not lost from the cage.
		 */
		if (d_ml != NULL) {
			for (ml = d_ml; ml; ml = ml->ml_next) {
				err = kcage_range_add(btop(ml->ml_address),
				    btop(ml->ml_size), KCAGE_DOWN);
				if (err != 0)
					cmn_err(CE_WARN,
					    "kcage_range_add() = %d", err);
			}
		} else {
			err = kcage_range_add(base, npgs, KCAGE_DOWN);
			if (err != 0)
				cmn_err(CE_WARN,
				    "kcage_range_add() = %d", err);
		}
	}
	memlist_free_list(d_ml);

	if ((err = kphysm_del_release(mh)) != KPHYSM_OK)
		cmn_err(CE_WARN, "%s: del_release() = %d", __func__, err);
	if (convert)
		rv = cvt_err(rv);

	DR_DBG_MEM("%s: rv=%d", __func__, rv);

	return (rv);
}