xref: /titanic_51/usr/src/uts/sun4v/io/dr_mem.c (revision c5fb5d329e745a7b8cb9ff07eebb42948af7bc4e)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * sun4v Memory DR Module
29  */
30 
31 
32 #include <sys/types.h>
33 #include <sys/cmn_err.h>
34 #include <sys/vmem.h>
35 #include <sys/kmem.h>
36 #include <sys/systm.h>
37 #include <sys/machsystm.h>	/* for page_freelist_coalesce() */
38 #include <sys/errno.h>
39 #include <sys/memnode.h>
40 #include <sys/memlist.h>
41 #include <sys/memlist_impl.h>
42 #include <sys/tuneable.h>
43 #include <sys/proc.h>
44 #include <sys/disp.h>
45 #include <sys/debug.h>
46 #include <sys/vm.h>
47 #include <sys/callb.h>
48 #include <sys/memlist_plat.h>	/* for installed_top_size() */
49 #include <sys/condvar_impl.h>	/* for CV_HAS_WAITERS() */
50 #include <sys/dumphdr.h>	/* for dump_resize() */
51 #include <sys/atomic.h>		/* for use in stats collection */
52 #include <sys/rwlock.h>
53 #include <vm/seg_kmem.h>
54 #include <vm/seg_kpm.h>
55 #include <vm/page.h>
56 #include <vm/vm_dep.h>
57 #define	SUNDDI_IMPL		/* so sunddi.h will not redefine splx() et al */
58 #include <sys/sunddi.h>
59 #include <sys/mem_config.h>
60 #include <sys/mem_cage.h>
61 #include <sys/lgrp.h>
62 #include <sys/ddi.h>
63 
64 #include <sys/modctl.h>
65 #include <sys/sysevent/dr.h>
66 #include <sys/mach_descrip.h>
67 #include <sys/mdesc.h>
68 #include <sys/ds.h>
69 #include <sys/drctl.h>
70 #include <sys/dr_util.h>
71 #include <sys/dr_mem.h>
72 
73 
74 /*
75  * DR operations are subject to Memory Alignment restrictions
76  * for both address and the size of the request.
77  */
78 #define	MA_ADDR	0x10000000	/* addr alignment 256M */
79 #define	MA_SIZE	0x10000000	/* size alignment 256M */
80 
81 #define	MBLK_IS_VALID(m) \
82 	(IS_P2ALIGNED((m)->addr, MA_ADDR) && IS_P2ALIGNED((m)->size, MA_SIZE))
83 
84 static memhandle_t dr_mh;	/* memory handle for delete */
85 
86 static struct modlmisc modlmisc = {
87 	&mod_miscops,
88 	"sun4v memory DR"
89 };
90 
91 static struct modlinkage modlinkage = {
92 	MODREV_1,
93 	(void *)&modlmisc,
94 	NULL
95 };
96 
97 static int dr_mem_allow_unload = 0;
98 
99 typedef int (*fn_t)(dr_mem_blk_t *, int *);
100 
101 /*
102  * Global Domain Services (DS) Handle
103  */
104 static ds_svc_hdl_t ds_handle;
105 
106 /*
107  * Supported DS Capability Versions
108  */
109 static ds_ver_t		dr_mem_vers[] = { { 1, 0 } };
110 #define	DR_MEM_NVERS	(sizeof (dr_mem_vers) / sizeof (dr_mem_vers[0]))
111 
112 /*
113  * DS Capability Description
114  */
115 static ds_capability_t dr_mem_cap = {
116 	DR_MEM_DS_ID,		/* svc_id */
117 	dr_mem_vers,		/* vers */
118 	DR_MEM_NVERS		/* nvers */
119 };
120 
121 /*
122  * DS Callbacks
123  */
124 static void dr_mem_reg_handler(ds_cb_arg_t, ds_ver_t *, ds_svc_hdl_t);
125 static void dr_mem_unreg_handler(ds_cb_arg_t arg);
126 static void dr_mem_data_handler(ds_cb_arg_t arg, void *buf, size_t buflen);
127 
128 /*
129  * DS Client Ops Vector
130  */
131 static ds_clnt_ops_t dr_mem_ops = {
132 	dr_mem_reg_handler,	/* ds_reg_cb */
133 	dr_mem_unreg_handler,	/* ds_unreg_cb */
134 	dr_mem_data_handler,	/* ds_data_cb */
135 	NULL			/* cb_arg */
136 };
137 
138 /*
139  * Operation Results
140  *
141  * Used internally to gather results while an operation on a
142  * list of mblks is in progress. In particular, it is used to
143  * keep track of which mblks have already failed so that they are
144  * not processed further, and the manner in which they failed.
145  */
146 typedef struct {
147 	uint64_t	addr;
148 	uint64_t	size;
149 	uint32_t	result;
150 	uint32_t	status;
151 	char		*string;
152 } dr_mem_res_t;
153 
154 static char *
155 dr_mem_estr[] = {
156 	"operation succeeded",		/* DR_MEM_RES_OK */
157 	"operation failed",		/* DR_MEM_RES_FAILURE */
158 	"operation was blocked",	/* DR_MEM_RES_BLOCKED */
159 	"memory not defined in MD",	/* DR_MEM_RES_NOT_IN_MD */
160 	"memory already in use",	/* DR_MEM_RES_ESPAN */
161 	"memory access test failed",	/* DR_MEM_RES_EFAULT */
162 	"resource not available",	/* DR_MEM_RES_ERESOURCE */
163 	"permanent pages in span",	/* DR_MEM_RES_PERM */
164 	"memory span busy",		/* DR_MEM_RES_EBUSY */
165 	"VM viability test failed",	/* DR_MEM_RES_ENOTVIABLE */
166 	"no pages to unconfigure",	/* DR_MEM_RES_ENOWORK */
167 	"operation cancelled",		/* DR_MEM_RES_ECANCELLED */
168 	"operation refused",		/* DR_MEM_RES_EREFUSED */
169 	"memory span duplicate",	/* DR_MEM_RES_EDUP */
170 	"invalid argument"		/* DR_MEM_RES_EINVAL */
171 };
172 
173 typedef struct {
174 	kcondvar_t cond;
175 	kmutex_t lock;
176 	int error;
177 	int done;
178 } mem_sync_t;
179 
180 /*
181  * Internal Functions
182  */
183 static int dr_mem_init(void);
184 static int dr_mem_fini(void);
185 
186 static int dr_mem_list_wrk(dr_mem_hdr_t *, dr_mem_hdr_t **, int *);
187 static int dr_mem_list_query(dr_mem_hdr_t *, dr_mem_hdr_t **, int *);
188 static int dr_mem_del_stat(dr_mem_hdr_t *, dr_mem_hdr_t **, int *);
189 static int dr_mem_del_cancel(dr_mem_hdr_t *, dr_mem_hdr_t **, int *);
190 
191 static int dr_mem_unconfigure(dr_mem_blk_t *, int *);
192 static int dr_mem_configure(dr_mem_blk_t *, int *);
193 static void dr_mem_query(dr_mem_blk_t *, dr_mem_query_t *);
194 
195 static dr_mem_res_t *dr_mem_res_array_init(dr_mem_hdr_t *, drctl_rsrc_t *, int);
196 static void dr_mem_res_array_fini(dr_mem_res_t *res, int nres);
197 static size_t dr_mem_pack_response(dr_mem_hdr_t *req, dr_mem_res_t *res,
198     dr_mem_hdr_t **respp);
199 
200 static int dr_mem_find(dr_mem_blk_t *mbp);
201 static mde_cookie_t dr_mem_find_node_md(dr_mem_blk_t *, md_t *, mde_cookie_t *);
202 
203 static int mem_add(pfn_t, pgcnt_t);
204 static int mem_del(pfn_t, pgcnt_t);
205 
206 extern int kphysm_add_memory_dynamic(pfn_t, pgcnt_t);
207 
208 int
209 _init(void)
210 {
211 	int	status;
212 
213 	/* check that Memory DR is enabled */
214 	if (dr_is_disabled(DR_TYPE_MEM))
215 		return (ENOTSUP);
216 
217 	if ((status = dr_mem_init()) != 0) {
218 		cmn_err(CE_NOTE, "Memory DR initialization failed");
219 		return (status);
220 	}
221 
222 	if ((status = mod_install(&modlinkage)) != 0) {
223 		(void) dr_mem_fini();
224 	}
225 
226 	return (status);
227 }
228 
229 int
230 _info(struct modinfo *modinfop)
231 {
232 	return (mod_info(&modlinkage, modinfop));
233 }
234 
235 int
236 _fini(void)
237 {
238 	int	status;
239 
240 	if (dr_mem_allow_unload == 0)
241 		return (EBUSY);
242 
243 	if ((status = mod_remove(&modlinkage)) == 0) {
244 		(void) dr_mem_fini();
245 	}
246 
247 	return (status);
248 }
249 
250 static int
251 dr_mem_init(void)
252 {
253 	int rv;
254 
255 	if ((rv = ds_cap_init(&dr_mem_cap, &dr_mem_ops)) != 0) {
256 		cmn_err(CE_NOTE, "dr_mem: ds_cap_init failed: %d", rv);
257 		return (rv);
258 	}
259 
260 	return (0);
261 }
262 
263 static int
264 dr_mem_fini(void)
265 {
266 	int rv;
267 
268 	if ((rv = ds_cap_fini(&dr_mem_cap)) != 0) {
269 		cmn_err(CE_NOTE, "dr_mem: ds_cap_fini failed: %d", rv);
270 	}
271 
272 	return (rv);
273 }
274 
275 static void
276 dr_mem_reg_handler(ds_cb_arg_t arg, ds_ver_t *ver, ds_svc_hdl_t hdl)
277 {
278 	DR_DBG_MEM("reg_handler: arg=0x%p, ver=%d.%d, hdl=0x%lx\n", arg,
279 	    ver->major, ver->minor, hdl);
280 
281 	ds_handle = hdl;
282 }
283 
284 static void
285 dr_mem_unreg_handler(ds_cb_arg_t arg)
286 {
287 	DR_DBG_MEM("unreg_handler: arg=0x%p\n", arg);
288 
289 	ds_handle = DS_INVALID_HDL;
290 }
291 
292 /*ARGSUSED*/
293 static void
294 dr_mem_data_handler(ds_cb_arg_t arg, void *buf, size_t buflen)
295 {
296 	dr_mem_hdr_t	*req = buf;
297 	dr_mem_hdr_t	err_resp;
298 	dr_mem_hdr_t	*resp = &err_resp;
299 	int		resp_len = 0;
300 	int		rv = EINVAL;
301 
302 	/*
303 	 * Sanity check the message
304 	 */
305 	if (buflen < sizeof (dr_mem_hdr_t)) {
306 		DR_DBG_MEM("incoming message short: expected at least %ld "
307 		    "bytes, received %ld\n", sizeof (dr_mem_hdr_t), buflen);
308 		goto done;
309 	}
310 
311 	if (req == NULL) {
312 		DR_DBG_MEM("empty message: expected at least %ld bytes\n",
313 		    sizeof (dr_mem_hdr_t));
314 		goto done;
315 	}
316 
317 	DR_DBG_MEM("incoming request:\n");
318 	DR_DBG_DUMP_MSG(buf, buflen);
319 
320 	/*
321 	 * Process the command
322 	 */
323 	switch (req->msg_type) {
324 	case DR_MEM_CONFIGURE:
325 	case DR_MEM_UNCONFIGURE:
326 		if (req->msg_arg == 0) {
327 			DR_DBG_MEM("No mblks specified for operation\n");
328 			goto done;
329 		}
330 		if ((rv = dr_mem_list_wrk(req, &resp, &resp_len)) != 0) {
331 			DR_DBG_MEM("%s failed (%d)\n",
332 			    (req->msg_type == DR_MEM_CONFIGURE) ?
333 			    "Memory configure" : "Memory unconfigure", rv);
334 		}
335 		break;
336 
337 	case DR_MEM_UNCONF_STATUS:
338 		if ((rv = dr_mem_del_stat(req, &resp, &resp_len)) != 0)
339 			DR_DBG_MEM("Memory delete status failed (%d)\n", rv);
340 		break;
341 
342 	case DR_MEM_UNCONF_CANCEL:
343 		if ((rv = dr_mem_del_cancel(req, &resp, &resp_len)) != 0)
344 			DR_DBG_MEM("Memory delete cancel failed (%d)\n", rv);
345 		break;
346 
347 	case DR_MEM_QUERY:
348 		if (req->msg_arg == 0) {
349 			DR_DBG_MEM("No mblks specified for operation\n");
350 			goto done;
351 		}
352 		if ((rv = dr_mem_list_query(req, &resp, &resp_len)) != 0)
353 			DR_DBG_MEM("Memory query failed (%d)\n", rv);
354 		break;
355 
356 	default:
357 		cmn_err(CE_NOTE, "unsupported memory DR operation (%d)",
358 		    req->msg_type);
359 		break;
360 	}
361 
362 done:
363 	/* check if an error occurred */
364 	if (resp == &err_resp) {
365 		resp->req_num = (req) ? req->req_num : 0;
366 		resp->msg_type = DR_MEM_ERROR;
367 		resp->msg_arg = rv;
368 		resp_len = sizeof (dr_mem_hdr_t);
369 	}
370 
371 	DR_DBG_MEM("outgoing response:\n");
372 	DR_DBG_DUMP_MSG(resp, resp_len);
373 
374 	/* send back the response */
375 	if (ds_cap_send(ds_handle, resp, resp_len) != 0) {
376 		DR_DBG_MEM("ds_send failed\n");
377 	}
378 
379 	/* free any allocated memory */
380 	if (resp != &err_resp) {
381 		kmem_free(resp, resp_len);
382 	}
383 }
384 
385 /*
386  * Common routine to config or unconfig multiple mblks.
387  *
388  * Note: Do not modify result buffer or length on error.
389  */
390 static int
391 dr_mem_list_wrk(dr_mem_hdr_t *req, dr_mem_hdr_t **resp, int *resp_len)
392 {
393 	int		rv;
394 	int		idx;
395 	int		count;
396 	int		result;
397 	int		status;
398 	fn_t		dr_fn;
399 	int		se_hint;
400 	dr_mem_blk_t	*req_mblks;
401 	dr_mem_res_t	*res;
402 	int		drctl_cmd;
403 	int		drctl_flags = 0;
404 	drctl_rsrc_t	*drctl_req;
405 	size_t		drctl_req_len;
406 	drctl_resp_t	*drctl_resp;
407 	drctl_rsrc_t	*drctl_rsrc;
408 	size_t		drctl_resp_len = 0;
409 	drctl_cookie_t	drctl_res_ck;
410 
411 	ASSERT((req != NULL) && (req->msg_arg != 0));
412 
413 	count = req->msg_arg;
414 
415 	/*
416 	 * Extract all information that is specific
417 	 * to the various types of operations.
418 	 */
419 	switch (req->msg_type) {
420 	case DR_MEM_CONFIGURE:
421 		dr_fn = dr_mem_configure;
422 		drctl_cmd = DRCTL_MEM_CONFIG_REQUEST;
423 		se_hint = SE_HINT_INSERT;
424 		break;
425 	case DR_MEM_UNCONFIGURE:
426 		dr_fn = dr_mem_unconfigure;
427 		drctl_cmd = DRCTL_MEM_UNCONFIG_REQUEST;
428 		se_hint = SE_HINT_REMOVE;
429 		break;
430 	default:
431 		/* Programming error if we reach this. */
432 		cmn_err(CE_NOTE, "%s: bad msg_type %d\n",
433 		    __func__, req->msg_type);
434 		ASSERT(0);
435 		return (-1);
436 	}
437 
438 	/* the incoming array of mblks to operate on */
439 	req_mblks = DR_MEM_CMD_MBLKS(req);
440 
441 	/* allocate drctl request msg based on incoming resource count */
442 	drctl_req_len = sizeof (drctl_rsrc_t) * count;
443 	drctl_req = kmem_zalloc(drctl_req_len, KM_SLEEP);
444 
445 	/* copy the size for the drctl call from the incoming request msg */
446 	for (idx = 0; idx < count; idx++) {
447 		drctl_req[idx].res_mem_addr = req_mblks[idx].addr;
448 		drctl_req[idx].res_mem_size = req_mblks[idx].size;
449 	}
450 
451 	rv = drctl_config_init(drctl_cmd, drctl_flags, drctl_req,
452 	    count, &drctl_resp, &drctl_resp_len, &drctl_res_ck);
453 
454 	ASSERT((drctl_resp != NULL) && (drctl_resp_len != 0));
455 
456 	if (rv != 0) {
457 		DR_DBG_MEM("%s: drctl_config_init returned: %d\n",
458 		    __func__, rv);
459 		kmem_free(drctl_resp, drctl_resp_len);
460 		kmem_free(drctl_req, drctl_req_len);
461 		return (rv);
462 	}
463 
464 	ASSERT(drctl_resp->resp_type == DRCTL_RESP_OK);
465 
466 	drctl_rsrc = drctl_resp->resp_resources;
467 
468 	/* create the result scratch array */
469 	res = dr_mem_res_array_init(req, drctl_rsrc, count);
470 
471 	/* perform the specified operation on each of the mblks */
472 	for (idx = 0; idx < count; idx++) {
473 		/*
474 		 * If no action will be taken against the current
475 		 * mblk, update the drctl resource information to
476 		 * ensure that it gets recovered properly during
477 		 * the drctl fini() call.
478 		 */
479 		if (res[idx].result != DR_MEM_RES_OK) {
480 			drctl_req[idx].status = DRCTL_STATUS_CONFIG_FAILURE;
481 			continue;
482 		}
483 
484 		/* call the function to perform the actual operation */
485 		result = (*dr_fn)(&req_mblks[idx], &status);
486 
487 		/* save off results of the operation */
488 		res[idx].result = result;
489 		res[idx].status = status;
490 		res[idx].addr = req_mblks[idx].addr;	/* for partial case */
491 		res[idx].size = req_mblks[idx].size;	/* for partial case */
492 		res[idx].string = i_ddi_strdup(dr_mem_estr[result], KM_SLEEP);
493 
494 		/* save result for drctl fini() reusing init() msg memory */
495 		drctl_req[idx].status = (result != DR_MEM_RES_OK) ?
496 		    DRCTL_STATUS_CONFIG_FAILURE : DRCTL_STATUS_CONFIG_SUCCESS;
497 
498 		DR_DBG_MEM("%s: mblk 0x%lx.0x%lx stat %d result %d off '%s'\n",
499 		    __func__, req_mblks[idx].addr, req_mblks[idx].size,
500 		    drctl_req[idx].status, result,
501 		    (res[idx].string) ? res[idx].string : "");
502 	}
503 
504 	if ((rv = drctl_config_fini(&drctl_res_ck, drctl_req, count)) != 0)
505 		DR_DBG_MEM("%s: drctl_config_fini returned: %d\n",
506 		    __func__, rv);
507 
508 	/*
509 	 * Operation completed without any fatal errors.
510 	 * Pack the response for transmission.
511 	 */
512 	*resp_len = dr_mem_pack_response(req, res, resp);
513 
514 	/* notify interested parties about the operation */
515 	dr_generate_event(DR_TYPE_MEM, se_hint);
516 
517 	/*
518 	 * Deallocate any scratch memory.
519 	 */
520 	kmem_free(drctl_resp, drctl_resp_len);
521 	kmem_free(drctl_req, drctl_req_len);
522 
523 	dr_mem_res_array_fini(res, count);
524 
525 	return (0);
526 }
527 
528 /*
529  * Allocate and initialize a result array based on the initial
530  * drctl operation. A valid result array is always returned.
531  */
532 static dr_mem_res_t *
533 dr_mem_res_array_init(dr_mem_hdr_t *req, drctl_rsrc_t *rsrc, int nrsrc)
534 {
535 	int		idx;
536 	dr_mem_res_t	*res;
537 	char		*err_str;
538 	size_t		err_len;
539 
540 	/* allocate zero filled buffer to initialize fields */
541 	res = kmem_zalloc(nrsrc * sizeof (dr_mem_res_t), KM_SLEEP);
542 
543 	/*
544 	 * Fill in the result information for each resource.
545 	 */
546 	for (idx = 0; idx < nrsrc; idx++) {
547 		res[idx].addr = rsrc[idx].res_mem_addr;
548 		res[idx].size = rsrc[idx].res_mem_size;
549 		res[idx].result = DR_MEM_RES_OK;
550 
551 		if (rsrc[idx].status == DRCTL_STATUS_ALLOW)
552 			continue;
553 
554 		/*
555 		 * Update the state information for this mblk.
556 		 */
557 		res[idx].result = DR_MEM_RES_BLOCKED;
558 		res[idx].status = (req->msg_type == DR_MEM_CONFIGURE) ?
559 		    DR_MEM_STAT_UNCONFIGURED : DR_MEM_STAT_CONFIGURED;
560 
561 		/*
562 		 * If an error string exists, copy it out of the
563 		 * message buffer. This eliminates any dependency
564 		 * on the memory allocated for the message buffer
565 		 * itself.
566 		 */
567 		if (rsrc[idx].offset != NULL) {
568 			err_str = (char *)rsrc + rsrc[idx].offset;
569 			err_len = strlen(err_str) + 1;
570 
571 			res[idx].string = kmem_alloc(err_len, KM_SLEEP);
572 			bcopy(err_str, res[idx].string, err_len);
573 		}
574 	}
575 
576 	return (res);
577 }
578 
579 static void
580 dr_mem_res_array_fini(dr_mem_res_t *res, int nres)
581 {
582 	int	idx;
583 	size_t	str_len;
584 
585 	for (idx = 0; idx < nres; idx++) {
586 		/* deallocate the error string if present */
587 		if (res[idx].string) {
588 			str_len = strlen(res[idx].string) + 1;
589 			kmem_free(res[idx].string, str_len);
590 		}
591 	}
592 
593 	/* deallocate the result array itself */
594 	kmem_free(res, sizeof (dr_mem_res_t) * nres);
595 }
596 
597 /*
598  * Allocate and pack a response message for transmission based
599  * on the specified result array. A valid response message and
600  * valid size information is always returned.
601  */
602 static size_t
603 dr_mem_pack_response(dr_mem_hdr_t *req, dr_mem_res_t *res, dr_mem_hdr_t **respp)
604 {
605 	int		idx;
606 	dr_mem_hdr_t	*resp;
607 	dr_mem_stat_t	*resp_stat;
608 	size_t		resp_len;
609 	uint32_t	curr_off;
610 	caddr_t		curr_str;
611 	size_t		str_len;
612 	size_t		stat_len;
613 	int		nstat = req->msg_arg;
614 
615 	/*
616 	 * Calculate the size of the response message
617 	 * and allocate an appropriately sized buffer.
618 	 */
619 	resp_len = sizeof (dr_mem_hdr_t);
620 
621 	/* add the stat array size */
622 	stat_len = sizeof (dr_mem_stat_t) * nstat;
623 	resp_len += stat_len;
624 
625 	/* add the size of any error strings */
626 	for (idx = 0; idx < nstat; idx++) {
627 		if (res[idx].string != NULL) {
628 			resp_len += strlen(res[idx].string) + 1;
629 		}
630 	}
631 
632 	/* allocate the message buffer */
633 	resp = kmem_zalloc(resp_len, KM_SLEEP);
634 
635 	/*
636 	 * Fill in the header information.
637 	 */
638 	resp->req_num = req->req_num;
639 	resp->msg_type = DR_MEM_OK;
640 	resp->msg_arg = nstat;
641 
642 	/*
643 	 * Fill in the stat information.
644 	 */
645 	resp_stat = DR_MEM_RESP_STATS(resp);
646 
647 	/* string offsets start immediately after stat array */
648 	curr_off = sizeof (dr_mem_hdr_t) + stat_len;
649 	curr_str = (char *)resp_stat + stat_len;
650 
651 	for (idx = 0; idx < nstat; idx++) {
652 		resp_stat[idx].addr = res[idx].addr;
653 		resp_stat[idx].size = res[idx].size;
654 		resp_stat[idx].result = res[idx].result;
655 		resp_stat[idx].status = res[idx].status;
656 
657 		if (res[idx].string != NULL) {
658 			/* copy over the error string */
659 			str_len = strlen(res[idx].string) + 1;
660 			bcopy(res[idx].string, curr_str, str_len);
661 			resp_stat[idx].string_off = curr_off;
662 
663 			curr_off += str_len;
664 			curr_str += str_len;
665 		}
666 	}
667 
668 	/* buffer should be exactly filled */
669 	ASSERT(curr_off == resp_len);
670 
671 	*respp = resp;
672 	return (resp_len);
673 }
674 
675 static void
676 dr_mem_query(dr_mem_blk_t *mbp, dr_mem_query_t *mqp)
677 {
678 	memquery_t mq;
679 
680 	DR_DBG_MEM("dr_mem_query...\n");
681 
682 
683 	(void) kphysm_del_span_query(btop(mbp->addr), btop(mbp->size), &mq);
684 
685 	if (!mq.phys_pages)
686 		return;
687 
688 	mqp->addr = mbp->addr;
689 	mqp->mq.phys_pages = ptob(mq.phys_pages);
690 	mqp->mq.managed = ptob(mq.managed);
691 	mqp->mq.nonrelocatable = ptob(mq.nonrelocatable);
692 	mqp->mq.first_nonrelocatable = ptob(mq.first_nonrelocatable);
693 	mqp->mq.last_nonrelocatable = ptob(mq.last_nonrelocatable);
694 	/*
695 	 * Set to the max byte offset within the page.
696 	 */
697 	if (mqp->mq.nonrelocatable)
698 		mqp->mq.last_nonrelocatable += PAGESIZE - 1;
699 }
700 
701 /*
702  * Do not modify result buffer or length on error.
703  */
704 static int
705 dr_mem_list_query(dr_mem_hdr_t *req, dr_mem_hdr_t **resp, int *resp_len)
706 {
707 	int		idx;
708 	int		rlen;
709 	int		nml;
710 	struct memlist	*ml;
711 	dr_mem_blk_t	*req_mblks, mb;
712 	dr_mem_hdr_t	*rp;
713 	dr_mem_query_t	*stat;
714 
715 	drctl_block();
716 
717 	/* the incoming array of req_mblks to configure */
718 	req_mblks = DR_MEM_CMD_MBLKS(req);
719 
720 	/* allocate a response message, should be freed by caller */
721 	nml = 0;
722 	rlen = sizeof (dr_mem_hdr_t);
723 	if (req_mblks->addr == NULL && req_mblks->size == 0) {
724 		/*
725 		 * Request is for domain's full view of it's memory.
726 		 */
727 		memlist_read_lock();
728 		for (ml = phys_install; ml; ml = ml->ml_next)
729 			nml++;
730 
731 		rlen += nml * sizeof (dr_mem_query_t);
732 	} else {
733 		rlen += req->msg_arg * sizeof (dr_mem_query_t);
734 	}
735 	rp = kmem_zalloc(rlen, KM_SLEEP);
736 
737 	/* fill in the known data */
738 	rp->req_num = req->req_num;
739 	rp->msg_type = DR_MEM_OK;
740 	rp->msg_arg = nml ? nml : req->msg_arg;
741 
742 	/* stat array for the response */
743 	stat = DR_MEM_RESP_QUERY(rp);
744 
745 	/* get the status for each of the mblocks */
746 	if (nml) {
747 		for (idx = 0, ml = phys_install; ml; ml = ml->ml_next, idx++) {
748 			mb.addr = ml->ml_address;
749 			mb.size = ml->ml_size;
750 			dr_mem_query(&mb, &stat[idx]);
751 		}
752 		memlist_read_unlock();
753 	} else {
754 		for (idx = 0; idx < req->msg_arg; idx++)
755 			dr_mem_query(&req_mblks[idx], &stat[idx]);
756 	}
757 
758 	*resp = rp;
759 	*resp_len = rlen;
760 
761 	drctl_unblock();
762 
763 	return (0);
764 }
765 
766 static int
767 cvt_err(int err)
768 {
769 	int rv;
770 
771 	switch (err) {
772 	case KPHYSM_OK:
773 		rv = DR_MEM_RES_OK;
774 		break;
775 	case KPHYSM_ESPAN:
776 		rv = DR_MEM_RES_ESPAN;
777 		break;
778 	case KPHYSM_EFAULT:
779 		rv = DR_MEM_RES_EFAULT;
780 		break;
781 	case KPHYSM_ERESOURCE:
782 		rv = DR_MEM_RES_ERESOURCE;
783 		break;
784 	case KPHYSM_ENOTSUP:
785 	case KPHYSM_ENOHANDLES:
786 		rv = DR_MEM_RES_FAILURE;
787 		break;
788 	case KPHYSM_ENONRELOC:
789 		rv = DR_MEM_RES_PERM;
790 		break;
791 	case KPHYSM_EHANDLE:
792 		rv = DR_MEM_RES_FAILURE;
793 		break;
794 	case KPHYSM_EBUSY:
795 		rv = DR_MEM_RES_EBUSY;
796 		break;
797 	case KPHYSM_ENOTVIABLE:
798 		rv = DR_MEM_RES_ENOTVIABLE;
799 		break;
800 	case KPHYSM_ESEQUENCE:
801 		rv = DR_MEM_RES_FAILURE;
802 		break;
803 	case KPHYSM_ENOWORK:
804 		rv = DR_MEM_RES_ENOWORK;
805 		break;
806 	case KPHYSM_ECANCELLED:
807 		rv = DR_MEM_RES_ECANCELLED;
808 		break;
809 	case KPHYSM_EREFUSED:
810 		rv = DR_MEM_RES_EREFUSED;
811 		break;
812 	case KPHYSM_ENOTFINISHED:
813 	case KPHYSM_ENOTRUNNING:
814 		rv = DR_MEM_RES_FAILURE;
815 		break;
816 	case KPHYSM_EDUP:
817 		rv = DR_MEM_RES_EDUP;
818 		break;
819 	default:
820 		rv = DR_MEM_RES_FAILURE;
821 		break;
822 	}
823 
824 	return (rv);
825 }
826 
827 static int
828 dr_mem_configure(dr_mem_blk_t *mbp, int *status)
829 {
830 	int rv;
831 	uint64_t addr, size;
832 
833 	rv = 0;
834 	addr = mbp->addr;
835 	size = mbp->size;
836 
837 	DR_DBG_MEM("dr_mem_configure...\n");
838 
839 	if (!MBLK_IS_VALID(mbp)) {
840 		DR_DBG_MEM("invalid mblk 0x%lx.0x%lx\n", addr, size);
841 		*status = DR_MEM_STAT_UNCONFIGURED;
842 		rv = DR_MEM_RES_EINVAL;
843 	} else if (rv = dr_mem_find(mbp)) {
844 		DR_DBG_MEM("failed to find mblk 0x%lx.0x%lx (%d)\n",
845 		    addr, size, rv);
846 		if (rv == EINVAL) {
847 			*status = DR_MEM_STAT_NOT_PRESENT;
848 			rv = DR_MEM_RES_NOT_IN_MD;
849 		} else {
850 			*status = DR_MEM_STAT_UNCONFIGURED;
851 			rv = DR_MEM_RES_FAILURE;
852 		}
853 	} else {
854 		rv = mem_add(btop(addr), btop(size));
855 		DR_DBG_MEM("addr=0x%lx size=0x%lx rv=%d\n", addr, size, rv);
856 		if (rv) {
857 			*status = DR_MEM_STAT_UNCONFIGURED;
858 		} else {
859 			*status = DR_MEM_STAT_CONFIGURED;
860 		}
861 	}
862 
863 	return (rv);
864 }
865 
866 static int
867 dr_mem_unconfigure(dr_mem_blk_t *mbp, int *status)
868 {
869 	int rv;
870 
871 	DR_DBG_MEM("dr_mem_unconfigure...\n");
872 
873 	if (!MBLK_IS_VALID(mbp)) {
874 		DR_DBG_MEM("invalid mblk 0x%lx.0x%lx\n",
875 		    mbp->addr, mbp->size);
876 			*status = DR_MEM_STAT_CONFIGURED;
877 			rv = DR_MEM_RES_EINVAL;
878 	} else if (rv = mem_del(btop(mbp->addr), btop(mbp->size))) {
879 		*status = DR_MEM_STAT_CONFIGURED;
880 	} else {
881 		*status = DR_MEM_STAT_UNCONFIGURED;
882 		rv = DR_MEM_RES_OK;
883 		DR_DBG_MEM("mblk 0x%lx.0x%lx unconfigured\n",
884 		    mbp->addr, mbp->size);
885 	}
886 	return (rv);
887 }
888 
889 static int
890 dr_mem_del_stat(dr_mem_hdr_t *req, dr_mem_hdr_t **resp, int *resp_len)
891 {
892 	int			status;
893 	int			rlen;
894 	memdelstat_t		del_stat, *stat;
895 	dr_mem_hdr_t		*rp;
896 
897 	/*
898 	 * If a mem delete is in progress, get its status.
899 	 */
900 	status = (dr_mh && (kphysm_del_status(dr_mh, &del_stat) == KPHYSM_OK));
901 
902 	/* allocate a response message, should be freed by caller */
903 	rlen = sizeof (dr_mem_hdr_t);
904 	rlen += status * sizeof (memdelstat_t);
905 	rp = kmem_zalloc(rlen, KM_SLEEP);
906 
907 	/* fill in the known data */
908 	rp->req_num = req->req_num;
909 	rp->msg_type = DR_MEM_OK;
910 	rp->msg_arg = status;
911 
912 	if (status) {
913 		/* stat struct for the response */
914 		stat = DR_MEM_RESP_DEL_STAT(rp);
915 		stat->phys_pages = ptob(del_stat.phys_pages);
916 		stat->managed = ptob(del_stat.managed);
917 		stat->collected = ptob(del_stat.collected);
918 	}
919 
920 	*resp = rp;
921 	*resp_len = rlen;
922 
923 	return (0);
924 }
925 
926 static int
927 dr_mem_del_cancel(dr_mem_hdr_t *req, dr_mem_hdr_t **resp, int *resp_len)
928 {
929 	int		rlen;
930 	dr_mem_hdr_t	*rp;
931 
932 	/* allocate a response message, should be freed by caller */
933 	rlen = sizeof (dr_mem_hdr_t);
934 	rp = kmem_zalloc(rlen, KM_SLEEP);
935 
936 	/* fill in the known data */
937 	rp->req_num = req->req_num;
938 	rp->msg_type = DR_MEM_OK;
939 	rp->msg_arg = (dr_mh && kphysm_del_cancel(dr_mh) != KPHYSM_OK) ?
940 	    DR_MEM_RES_EINVAL : DR_MEM_RES_OK;
941 
942 	*resp = rp;
943 	*resp_len = rlen;
944 
945 	return (0);
946 }
947 
948 static int
949 dr_mem_find(dr_mem_blk_t *mbp)
950 {
951 	md_t		*mdp = NULL;
952 	int		num_nodes;
953 	int		rv = 0;
954 	int		listsz;
955 	mde_cookie_t	*listp = NULL;
956 	mde_cookie_t	memnode;
957 	char		*found = "found";
958 
959 	if ((mdp = md_get_handle()) == NULL) {
960 		DR_DBG_MEM("unable to initialize machine description\n");
961 		return (-1);
962 	}
963 
964 	num_nodes = md_node_count(mdp);
965 	ASSERT(num_nodes > 0);
966 
967 	listsz = num_nodes * sizeof (mde_cookie_t);
968 	listp = kmem_zalloc(listsz, KM_SLEEP);
969 
970 	memnode = dr_mem_find_node_md(mbp, mdp, listp);
971 
972 	if (memnode == MDE_INVAL_ELEM_COOKIE) {
973 		rv = EINVAL;
974 		found = "not found";
975 	}
976 
977 	DR_DBG_MEM("mblk 0x%lx.0x%lx %s\n", mbp->addr, mbp->size, found);
978 
979 	kmem_free(listp, listsz);
980 	(void) md_fini_handle(mdp);
981 
982 	return (rv);
983 }
984 
985 /*
986  * Look up a particular mblk in the MD. Returns the mde_cookie_t
987  * representing that mblk if present, and MDE_INVAL_ELEM_COOKIE
988  * otherwise. It is assumed the scratch array has already been
989  * allocated so that it can accommodate the worst case scenario,
990  * every node in the MD.
991  */
992 static mde_cookie_t
993 dr_mem_find_node_md(dr_mem_blk_t *mbp, md_t *mdp, mde_cookie_t *listp)
994 {
995 	int		idx;
996 	int		nnodes;
997 	mde_cookie_t	rootnode;
998 	uint64_t	base_prop;
999 	uint64_t	size_prop;
1000 	mde_cookie_t	result = MDE_INVAL_ELEM_COOKIE;
1001 
1002 	rootnode = md_root_node(mdp);
1003 	ASSERT(rootnode != MDE_INVAL_ELEM_COOKIE);
1004 
1005 	/*
1006 	 * Scan the DAG for all the mem nodes
1007 	 */
1008 	nnodes = md_scan_dag(mdp, rootnode, md_find_name(mdp, "mblock"),
1009 	    md_find_name(mdp, "fwd"), listp);
1010 
1011 	if (nnodes < 0) {
1012 		DR_DBG_MEM("Scan for mblks failed\n");
1013 		return (result);
1014 	}
1015 
1016 	DR_DBG_MEM("dr_mem_find_node_md: found %d mblks in the MD\n", nnodes);
1017 
1018 	/*
1019 	 * Find the mblk of interest
1020 	 */
1021 	for (idx = 0; idx < nnodes; idx++) {
1022 
1023 		if (md_get_prop_val(mdp, listp[idx], "base", &base_prop)) {
1024 			DR_DBG_MEM("Missing 'base' property for mblk node %d\n",
1025 			    idx);
1026 			break;
1027 		}
1028 
1029 		if (md_get_prop_val(mdp, listp[idx], "size", &size_prop)) {
1030 			DR_DBG_MEM("Missing 'size' property for mblk node %d\n",
1031 			    idx);
1032 			break;
1033 		}
1034 
1035 		if (base_prop <= mbp->addr &&
1036 		    (base_prop + size_prop) >= (mbp->addr + mbp->size)) {
1037 			/* found a match */
1038 			DR_DBG_MEM("dr_mem_find_node_md: found mblk "
1039 			    "0x%lx.0x%lx in MD\n", mbp->addr, mbp->size);
1040 			result = listp[idx];
1041 			break;
1042 		}
1043 	}
1044 
1045 	if (result == MDE_INVAL_ELEM_COOKIE) {
1046 		DR_DBG_MEM("mblk 0x%lx.0x%lx not in MD\n",
1047 		    mbp->addr, mbp->size);
1048 	}
1049 
1050 	return (result);
1051 }
1052 
1053 static int
1054 mem_add(pfn_t base, pgcnt_t npgs)
1055 {
1056 	int rv, rc;
1057 
1058 	DR_DBG_MEM("%s: begin base=0x%lx npgs=0x%lx\n", __func__, base, npgs);
1059 
1060 	if (npgs == 0)
1061 		return (DR_MEM_RES_OK);
1062 
1063 	rv = kphysm_add_memory_dynamic(base, npgs);
1064 	DR_DBG_MEM("%s: kphysm_add(0x%lx, 0x%lx) = %d", __func__, base, npgs,
1065 	    rv);
1066 	if (rv == KPHYSM_OK) {
1067 		if (rc = kcage_range_add(base, npgs, KCAGE_DOWN))
1068 			cmn_err(CE_WARN, "kcage_range_add() = %d", rc);
1069 	}
1070 	rv = cvt_err(rv);
1071 	return (rv);
1072 }
1073 
1074 static void
1075 del_done(void *arg, int error)
1076 {
1077 	mem_sync_t *ms = arg;
1078 
1079 	mutex_enter(&ms->lock);
1080 	ms->error = error;
1081 	ms->done = 1;
1082 	cv_signal(&ms->cond);
1083 	mutex_exit(&ms->lock);
1084 }
1085 
1086 static int
1087 mem_del(pfn_t base, pgcnt_t npgs)
1088 {
1089 	int rv, err, del_range = 0;
1090 	int convert = 1;
1091 	mem_sync_t ms;
1092 	memquery_t mq;
1093 	memhandle_t mh;
1094 	struct memlist *ml;
1095 	struct memlist *d_ml = NULL;
1096 
1097 	DR_DBG_MEM("%s: begin base=0x%lx npgs=0x%lx\n", __func__, base, npgs);
1098 
1099 	if (npgs == 0)
1100 		return (DR_MEM_RES_OK);
1101 
1102 	if ((rv = kphysm_del_gethandle(&mh)) != KPHYSM_OK) {
1103 		cmn_err(CE_WARN, "%s: del_gethandle() = %d", __func__, rv);
1104 		rv = cvt_err(rv);
1105 		return (rv);
1106 	}
1107 	if ((rv = kphysm_del_span_query(base, npgs, &mq))
1108 	    != KPHYSM_OK) {
1109 		cmn_err(CE_WARN, "%s: del_span_query() = %d", __func__, rv);
1110 		goto done;
1111 	}
1112 	if (mq.nonrelocatable) {
1113 		DR_DBG_MEM("%s: non-reloc pages = %ld",
1114 		    __func__, mq.nonrelocatable);
1115 		rv  = KPHYSM_ENONRELOC;
1116 		goto done;
1117 	}
1118 	if (rv = kcage_range_delete(base, npgs)) {
1119 		switch (rv) {
1120 		case EBUSY:
1121 			rv = DR_MEM_RES_ENOTVIABLE;
1122 			break;
1123 		default:
1124 			rv = DR_MEM_RES_FAILURE;
1125 			break;
1126 		}
1127 		convert = 0; /* conversion done */
1128 		cmn_err(CE_WARN, "%s: del_range() = %d", __func__, rv);
1129 		goto done;
1130 	} else {
1131 		del_range++;
1132 	}
1133 	if ((rv = kphysm_del_span(mh, base, npgs)) != KPHYSM_OK) {
1134 		cmn_err(CE_WARN, "%s: del_span() = %d", __func__, rv);
1135 		goto done;
1136 	}
1137 	if ((rv = memlist_add_span(ptob(base), ptob(npgs), &d_ml))
1138 	    != MEML_SPANOP_OK) {
1139 		switch (rv) {
1140 		case MEML_SPANOP_ESPAN:
1141 			rv = DR_MEM_RES_ESPAN;
1142 			break;
1143 		case MEML_SPANOP_EALLOC:
1144 			rv = DR_MEM_RES_ERESOURCE;
1145 			break;
1146 		default:
1147 			rv = DR_MEM_RES_FAILURE;
1148 			break;
1149 		}
1150 		convert = 0; /* conversion done */
1151 		cmn_err(CE_WARN, "%s: add_span() = %d", __func__, rv);
1152 		goto done;
1153 	}
1154 
1155 	DR_DBG_MEM("%s: reserved=0x%lx", __func__, npgs);
1156 
1157 	bzero((void *) &ms, sizeof (ms));
1158 
1159 	mutex_init(&ms.lock, NULL, MUTEX_DRIVER, NULL);
1160 	cv_init(&ms.cond, NULL, CV_DRIVER, NULL);
1161 	mutex_enter(&ms.lock);
1162 
1163 	if ((rv = kphysm_del_start(mh, del_done, (void *) &ms)) == KPHYSM_OK) {
1164 		/*
1165 		 * Since we've called drctl_config_init, we are the only
1166 		 * DR ctl operation in progress.  Set dr_mh to the
1167 		 * delete memhandle for use by stat and cancel.
1168 		 */
1169 		ASSERT(dr_mh == NULL);
1170 		dr_mh = mh;
1171 
1172 		/*
1173 		 * Wait for completion or interrupt.
1174 		 */
1175 		while (!ms.done) {
1176 			if (cv_wait_sig(&ms.cond, &ms.lock) == 0) {
1177 				/*
1178 				 * There is a pending signal.
1179 				 */
1180 				(void) kphysm_del_cancel(mh);
1181 				DR_DBG_MEM("%s: cancel", __func__);
1182 				/*
1183 				 * Wait for completion.
1184 				 */
1185 				while (!ms.done)
1186 					cv_wait(&ms.cond, &ms.lock);
1187 			}
1188 		}
1189 		dr_mh = NULL;
1190 		rv = ms.error;
1191 	} else {
1192 		DR_DBG_MEM("%s: del_start() = %d", __func__, rv);
1193 	}
1194 
1195 	mutex_exit(&ms.lock);
1196 	cv_destroy(&ms.cond);
1197 	mutex_destroy(&ms.lock);
1198 
1199 done:
1200 	if (rv && del_range) {
1201 		/*
1202 		 * Add back the spans to the kcage growth list.
1203 		 */
1204 		for (ml = d_ml; ml; ml = ml->ml_next)
1205 			if (err = kcage_range_add(btop(ml->ml_address),
1206 			    btop(ml->ml_size), KCAGE_DOWN))
1207 				cmn_err(CE_WARN, "kcage_range_add() = %d", err);
1208 	}
1209 	memlist_free_list(d_ml);
1210 
1211 	if ((err = kphysm_del_release(mh)) != KPHYSM_OK)
1212 		cmn_err(CE_WARN, "%s: del_release() = %d", __func__, err);
1213 	if (convert)
1214 		rv = cvt_err(rv);
1215 
1216 	DR_DBG_MEM("%s: rv=%d", __func__, rv);
1217 
1218 	return (rv);
1219 }
1220