/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * sun4v Memory DR Module
 */


#include <sys/types.h>
#include <sys/cmn_err.h>
#include <sys/vmem.h>
#include <sys/kmem.h>
#include <sys/systm.h>
#include <sys/machsystm.h>	/* for page_freelist_coalesce() */
#include <sys/errno.h>
#include <sys/memnode.h>
#include <sys/memlist.h>
#include <sys/memlist_impl.h>
#include <sys/tuneable.h>
#include <sys/proc.h>
#include <sys/disp.h>
#include <sys/debug.h>
#include <sys/vm.h>
#include <sys/callb.h>
#include <sys/memlist_plat.h>	/* for installed_top_size() */
#include <sys/condvar_impl.h>	/* for CV_HAS_WAITERS() */
#include <sys/dumphdr.h>	/* for dump_resize() */
#include <sys/atomic.h>		/* for use in stats collection */
#include <sys/rwlock.h>
#include <vm/seg_kmem.h>
#include <vm/seg_kpm.h>
#include <vm/page.h>
#include <vm/vm_dep.h>
#define	SUNDDI_IMPL		/* so sunddi.h will not redefine splx() et al */
#include <sys/sunddi.h>
#include <sys/mem_config.h>
#include <sys/mem_cage.h>
#include <sys/lgrp.h>
#include <sys/ddi.h>

#include <sys/modctl.h>
#include <sys/sysevent/dr.h>
#include <sys/mach_descrip.h>
#include <sys/mdesc.h>
#include <sys/ds.h>
#include <sys/drctl.h>
#include <sys/dr_util.h>
#include <sys/dr_mem.h>


/*
 * DR operations are subject to memory alignment restrictions
 * on both the address and the size of a request.
 */
#define	MA_ADDR	0x10000000	/* addr alignment 256M */
#define	MA_SIZE	0x10000000	/* size alignment 256M */

#define	MBLK_IS_VALID(m) \
	(IS_P2ALIGNED((m)->addr, MA_ADDR) && IS_P2ALIGNED((m)->size, MA_SIZE))
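
/*
 * For illustration (hypothetical values, not from any particular
 * domain): a request for 512M at 2G is valid because both fields are
 * multiples of the 256M alignment:
 *
 *	addr = 0x80000000	IS_P2ALIGNED(0x80000000, 0x10000000) holds
 *	size = 0x20000000	IS_P2ALIGNED(0x20000000, 0x10000000) holds
 *
 * whereas addr = 0x88000000 (2G + 128M) fails MBLK_IS_VALID() since
 * 0x88000000 & (0x10000000 - 1) == 0x8000000.
 */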

static memhandle_t dr_mh;	/* memory handle for delete */

static struct modlmisc modlmisc = {
	&mod_miscops,
	"sun4v memory DR"
};

static struct modlinkage modlinkage = {
	MODREV_1,
	(void *)&modlmisc,
	NULL
};

static int dr_mem_allow_unload = 0;

typedef int (*fn_t)(dr_mem_blk_t *, int *);

/*
 * Global Domain Services (DS) Handle
 */
static ds_svc_hdl_t ds_handle;

/*
 * Supported DS Capability Versions
 */
static ds_ver_t		dr_mem_vers[] = { { 1, 0 } };
#define	DR_MEM_NVERS	(sizeof (dr_mem_vers) / sizeof (dr_mem_vers[0]))

/*
 * DS Capability Description
 */
static ds_capability_t dr_mem_cap = {
	DR_MEM_DS_ID,		/* svc_id */
	dr_mem_vers,		/* vers */
	DR_MEM_NVERS		/* nvers */
};

/*
 * DS Callbacks
 */
static void dr_mem_reg_handler(ds_cb_arg_t, ds_ver_t *, ds_svc_hdl_t);
static void dr_mem_unreg_handler(ds_cb_arg_t arg);
static void dr_mem_data_handler(ds_cb_arg_t arg, void *buf, size_t buflen);

/*
 * DS Client Ops Vector
 */
static ds_clnt_ops_t dr_mem_ops = {
	dr_mem_reg_handler,	/* ds_reg_cb */
	dr_mem_unreg_handler,	/* ds_unreg_cb */
	dr_mem_data_handler,	/* ds_data_cb */
	NULL			/* cb_arg */
};
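
/*
 * How these pieces fit together (a summary of the calls made in this
 * file, not of any additional DS framework behavior): at module load,
 * dr_mem_init() hands dr_mem_cap and dr_mem_ops to ds_cap_init().
 * When the service is registered, dr_mem_reg_handler() caches the
 * service handle in ds_handle; each incoming request then arrives via
 * dr_mem_data_handler(), which sends its response back over the same
 * handle with ds_cap_send(). Unregistration resets ds_handle to
 * DS_INVALID_HDL.
 */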

/*
 * Operation Results
 *
 * Used internally to gather results while an operation on a
 * list of mblks is in progress. In particular, it is used to
 * keep track of which mblks have already failed so that they are
 * not processed further, and the manner in which they failed.
 */
typedef struct {
	uint64_t	addr;
	uint64_t	size;
	uint32_t	result;
	uint32_t	status;
	char		*string;
} dr_mem_res_t;

static char *
dr_mem_estr[] = {
	"operation succeeded",		/* DR_MEM_RES_OK */
	"operation failed",		/* DR_MEM_RES_FAILURE */
	"operation was blocked",	/* DR_MEM_RES_BLOCKED */
	"memory not defined in MD",	/* DR_MEM_RES_NOT_IN_MD */
	"memory already in use",	/* DR_MEM_RES_ESPAN */
	"memory access test failed",	/* DR_MEM_RES_EFAULT */
	"resource not available",	/* DR_MEM_RES_ERESOURCE */
	"permanent pages in span",	/* DR_MEM_RES_PERM */
	"memory span busy",		/* DR_MEM_RES_EBUSY */
	"VM viability test failed",	/* DR_MEM_RES_ENOTVIABLE */
	"no pages to unconfigure",	/* DR_MEM_RES_ENOWORK */
	"operation cancelled",		/* DR_MEM_RES_ECANCELLED */
	"operation refused",		/* DR_MEM_RES_EREFUSED */
	"memory span duplicate",	/* DR_MEM_RES_EDUP */
	"invalid argument"		/* DR_MEM_RES_EINVAL */
};

typedef struct {
	kcondvar_t cond;
	kmutex_t lock;
	int error;
	int done;
} mem_sync_t;
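
/*
 * mem_sync_t implements the standard completion handshake between
 * mem_del(), which blocks, and the del_done() callback, which fires
 * when the delete finishes. A minimal sketch of the pattern (the real
 * code in mem_del() below also handles signal interruption):
 *
 *	mutex_enter(&ms.lock);
 *	(void) kphysm_del_start(mh, del_done, (void *)&ms);
 *	while (!ms.done)
 *		cv_wait(&ms.cond, &ms.lock);
 *	mutex_exit(&ms.lock);
 *
 * del_done() sets ms.done and ms.error under ms.lock and signals
 * ms.cond.
 */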

/*
 * Internal Functions
 */
static int dr_mem_init(void);
static int dr_mem_fini(void);

static int dr_mem_list_wrk(dr_mem_hdr_t *, dr_mem_hdr_t **, int *);
static int dr_mem_list_query(dr_mem_hdr_t *, dr_mem_hdr_t **, int *);
static int dr_mem_del_stat(dr_mem_hdr_t *, dr_mem_hdr_t **, int *);
static int dr_mem_del_cancel(dr_mem_hdr_t *, dr_mem_hdr_t **, int *);

static int dr_mem_unconfigure(dr_mem_blk_t *, int *);
static int dr_mem_configure(dr_mem_blk_t *, int *);
static void dr_mem_query(dr_mem_blk_t *, dr_mem_query_t *);

static dr_mem_res_t *dr_mem_res_array_init(dr_mem_hdr_t *, drctl_rsrc_t *, int);
static void dr_mem_res_array_fini(dr_mem_res_t *res, int nres);
static size_t dr_mem_pack_response(dr_mem_hdr_t *req, dr_mem_res_t *res,
    dr_mem_hdr_t **respp);

static int dr_mem_find(dr_mem_blk_t *mbp);
static mde_cookie_t dr_mem_find_node_md(dr_mem_blk_t *, md_t *, mde_cookie_t *);

static int mem_add(pfn_t, pgcnt_t);
static int mem_del(pfn_t, pgcnt_t);

extern int kphysm_add_memory_dynamic(pfn_t, pgcnt_t);

int
_init(void)
{
	int	status;

	/* check that Memory DR is enabled */
	if (dr_is_disabled(DR_TYPE_MEM))
		return (ENOTSUP);

	if ((status = dr_mem_init()) != 0) {
		cmn_err(CE_NOTE, "Memory DR initialization failed");
		return (status);
	}

	if ((status = mod_install(&modlinkage)) != 0) {
		(void) dr_mem_fini();
	}

	return (status);
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}

int
_fini(void)
{
	int	status;

	if (dr_mem_allow_unload == 0)
		return (EBUSY);

	if ((status = mod_remove(&modlinkage)) == 0) {
		(void) dr_mem_fini();
	}

	return (status);
}

static int
dr_mem_init(void)
{
	int rv;

	if ((rv = ds_cap_init(&dr_mem_cap, &dr_mem_ops)) != 0) {
		cmn_err(CE_NOTE, "dr_mem: ds_cap_init failed: %d", rv);
		return (rv);
	}

	return (0);
}

static int
dr_mem_fini(void)
{
	int rv;

	if ((rv = ds_cap_fini(&dr_mem_cap)) != 0) {
		cmn_err(CE_NOTE, "dr_mem: ds_cap_fini failed: %d", rv);
	}

	return (rv);
}

static void
dr_mem_reg_handler(ds_cb_arg_t arg, ds_ver_t *ver, ds_svc_hdl_t hdl)
{
	DR_DBG_MEM("reg_handler: arg=0x%p, ver=%d.%d, hdl=0x%lx\n", arg,
	    ver->major, ver->minor, hdl);

	ds_handle = hdl;
}

static void
dr_mem_unreg_handler(ds_cb_arg_t arg)
{
	DR_DBG_MEM("unreg_handler: arg=0x%p\n", arg);

	ds_handle = DS_INVALID_HDL;
}

/*ARGSUSED*/
static void
dr_mem_data_handler(ds_cb_arg_t arg, void *buf, size_t buflen)
{
	dr_mem_hdr_t	*req = buf;
	dr_mem_hdr_t	err_resp;
	dr_mem_hdr_t	*resp = &err_resp;
	int		resp_len = 0;
	int		rv = EINVAL;

	/*
	 * Sanity check the message
	 */
	if (buflen < sizeof (dr_mem_hdr_t)) {
		DR_DBG_MEM("incoming message too short: expected at least "
		    "%ld bytes, received %ld\n", sizeof (dr_mem_hdr_t), buflen);
		goto done;
	}

	if (req == NULL) {
		DR_DBG_MEM("empty message: expected at least %ld bytes\n",
		    sizeof (dr_mem_hdr_t));
		goto done;
	}

	DR_DBG_MEM("incoming request:\n");
	DR_DBG_DUMP_MSG(buf, buflen);

	/*
	 * Process the command
	 */
	switch (req->msg_type) {
	case DR_MEM_CONFIGURE:
	case DR_MEM_UNCONFIGURE:
		if (req->msg_arg == 0) {
			DR_DBG_MEM("No mblks specified for operation\n");
			goto done;
		}
		if ((rv = dr_mem_list_wrk(req, &resp, &resp_len)) != 0) {
			DR_DBG_MEM("%s failed (%d)\n",
			    (req->msg_type == DR_MEM_CONFIGURE) ?
			    "Memory configure" : "Memory unconfigure", rv);
		}
		break;

	case DR_MEM_UNCONF_STATUS:
		if ((rv = dr_mem_del_stat(req, &resp, &resp_len)) != 0)
			DR_DBG_MEM("Memory delete status failed (%d)\n", rv);
		break;

	case DR_MEM_UNCONF_CANCEL:
		if ((rv = dr_mem_del_cancel(req, &resp, &resp_len)) != 0)
			DR_DBG_MEM("Memory delete cancel failed (%d)\n", rv);
		break;

	case DR_MEM_QUERY:
		if (req->msg_arg == 0) {
			DR_DBG_MEM("No mblks specified for operation\n");
			goto done;
		}
		if ((rv = dr_mem_list_query(req, &resp, &resp_len)) != 0)
			DR_DBG_MEM("Memory query failed (%d)\n", rv);
		break;

	default:
		cmn_err(CE_NOTE, "unsupported memory DR operation (%d)",
		    req->msg_type);
		break;
	}

done:
	/* check if an error occurred */
	if (resp == &err_resp) {
		resp->req_num = (req) ? req->req_num : 0;
		resp->msg_type = DR_MEM_ERROR;
		resp->msg_arg = rv;
		resp_len = sizeof (dr_mem_hdr_t);
	}

	DR_DBG_MEM("outgoing response:\n");
	DR_DBG_DUMP_MSG(resp, resp_len);

	/* send back the response */
	if (ds_cap_send(ds_handle, resp, resp_len) != 0) {
		DR_DBG_MEM("ds_send failed\n");
	}

	/* free any allocated memory */
	if (resp != &err_resp) {
		kmem_free(resp, resp_len);
	}
}

/*
 * Common routine to config or unconfig multiple mblks.
 *
 * Note: Do not modify result buffer or length on error.
 */
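/*
 * The flow below is a three-phase handshake with the drctl module
 * (a summary of the code that follows, not additional behavior):
 *
 *	1. drctl_config_init()	- ask permission for every mblk in the
 *				  request; blocked mblks come back with
 *				  a status other than DRCTL_STATUS_ALLOW.
 *	2. dr_fn() per mblk	- dr_mem_configure() or
 *				  dr_mem_unconfigure() on each mblk that
 *				  was allowed.
 *	3. drctl_config_fini()	- report per-mblk success or failure so
 *				  drctl can finish or back out its state.
 */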
static int
dr_mem_list_wrk(dr_mem_hdr_t *req, dr_mem_hdr_t **resp, int *resp_len)
{
	int		rv;
	int		idx;
	int		count;
	int		result;
	int		status;
	fn_t		dr_fn;
	int		se_hint;
	dr_mem_blk_t	*req_mblks;
	dr_mem_res_t	*res;
	int		drctl_cmd;
	int		drctl_flags = 0;
	drctl_rsrc_t	*drctl_req;
	size_t		drctl_req_len;
	drctl_resp_t	*drctl_resp;
	drctl_rsrc_t	*drctl_rsrc;
	size_t		drctl_resp_len = 0;
	drctl_cookie_t	drctl_res_ck;

	ASSERT((req != NULL) && (req->msg_arg != 0));

	count = req->msg_arg;

	/*
	 * Extract all information that is specific
	 * to the various types of operations.
	 */
	switch (req->msg_type) {
	case DR_MEM_CONFIGURE:
		dr_fn = dr_mem_configure;
		drctl_cmd = DRCTL_MEM_CONFIG_REQUEST;
		se_hint = SE_HINT_INSERT;
		break;
	case DR_MEM_UNCONFIGURE:
		dr_fn = dr_mem_unconfigure;
		drctl_cmd = DRCTL_MEM_UNCONFIG_REQUEST;
		se_hint = SE_HINT_REMOVE;
		break;
	default:
		/* Programming error if we reach this. */
		cmn_err(CE_NOTE, "%s: bad msg_type %d",
		    __func__, req->msg_type);
		ASSERT(0);
		return (-1);
	}

	/* the incoming array of mblks to operate on */
	req_mblks = DR_MEM_CMD_MBLKS(req);

	/* allocate drctl request msg based on incoming resource count */
	drctl_req_len = sizeof (drctl_rsrc_t) * count;
	drctl_req = kmem_zalloc(drctl_req_len, KM_SLEEP);

	/* copy the size for the drctl call from the incoming request msg */
	for (idx = 0; idx < count; idx++) {
		drctl_req[idx].res_mem_addr = req_mblks[idx].addr;
		drctl_req[idx].res_mem_size = req_mblks[idx].size;
	}

	rv = drctl_config_init(drctl_cmd, drctl_flags, drctl_req,
	    count, &drctl_resp, &drctl_resp_len, &drctl_res_ck);

	ASSERT((drctl_resp != NULL) && (drctl_resp_len != 0));

	if (rv != 0) {
		DR_DBG_MEM("%s: drctl_config_init returned: %d\n",
		    __func__, rv);
		kmem_free(drctl_resp, drctl_resp_len);
		kmem_free(drctl_req, drctl_req_len);
		return (rv);
	}

	ASSERT(drctl_resp->resp_type == DRCTL_RESP_OK);

	drctl_rsrc = drctl_resp->resp_resources;

	/* create the result scratch array */
	res = dr_mem_res_array_init(req, drctl_rsrc, count);

	/* perform the specified operation on each of the mblks */
	for (idx = 0; idx < count; idx++) {
		/*
		 * If no action will be taken against the current
		 * mblk, update the drctl resource information to
		 * ensure that it gets recovered properly during
		 * the drctl fini() call.
		 */
		if (res[idx].result != DR_MEM_RES_OK) {
			drctl_req[idx].status = DRCTL_STATUS_CONFIG_FAILURE;
			continue;
		}

		/* call the function to perform the actual operation */
		result = (*dr_fn)(&req_mblks[idx], &status);

		/* save off results of the operation */
		res[idx].result = result;
		res[idx].status = status;
		res[idx].addr = req_mblks[idx].addr;	/* for partial case */
		res[idx].size = req_mblks[idx].size;	/* for partial case */
		res[idx].string = i_ddi_strdup(dr_mem_estr[result], KM_SLEEP);

		/* save result for drctl fini() reusing init() msg memory */
		drctl_req[idx].status = (result != DR_MEM_RES_OK) ?
		    DRCTL_STATUS_CONFIG_FAILURE : DRCTL_STATUS_CONFIG_SUCCESS;

		DR_DBG_MEM("%s: mblk 0x%lx.0x%lx stat %d result %d off '%s'\n",
		    __func__, req_mblks[idx].addr, req_mblks[idx].size,
		    drctl_req[idx].status, result,
		    (res[idx].string) ? res[idx].string : "");
	}

	if ((rv = drctl_config_fini(&drctl_res_ck, drctl_req, count)) != 0)
		DR_DBG_MEM("%s: drctl_config_fini returned: %d\n",
		    __func__, rv);

	/*
	 * Operation completed without any fatal errors.
	 * Pack the response for transmission.
	 */
	*resp_len = dr_mem_pack_response(req, res, resp);

	/* notify interested parties about the operation */
	dr_generate_event(DR_TYPE_MEM, se_hint);

	/*
	 * Deallocate any scratch memory.
	 */
	kmem_free(drctl_resp, drctl_resp_len);
	kmem_free(drctl_req, drctl_req_len);

	dr_mem_res_array_fini(res, count);

	return (0);
}

/*
 * Allocate and initialize a result array based on the initial
 * drctl operation. A valid result array is always returned.
 */
static dr_mem_res_t *
dr_mem_res_array_init(dr_mem_hdr_t *req, drctl_rsrc_t *rsrc, int nrsrc)
{
	int		idx;
	dr_mem_res_t	*res;
	char		*err_str;
	size_t		err_len;

	/* allocate zero filled buffer to initialize fields */
	res = kmem_zalloc(nrsrc * sizeof (dr_mem_res_t), KM_SLEEP);

	/*
	 * Fill in the result information for each resource.
	 */
	for (idx = 0; idx < nrsrc; idx++) {
		res[idx].addr = rsrc[idx].res_mem_addr;
		res[idx].size = rsrc[idx].res_mem_size;
		res[idx].result = DR_MEM_RES_OK;

		if (rsrc[idx].status == DRCTL_STATUS_ALLOW)
			continue;

		/*
		 * Update the state information for this mblk.
		 */
		res[idx].result = DR_MEM_RES_BLOCKED;
		res[idx].status = (req->msg_type == DR_MEM_CONFIGURE) ?
		    DR_MEM_STAT_UNCONFIGURED : DR_MEM_STAT_CONFIGURED;

		/*
		 * If an error string exists, copy it out of the
		 * message buffer. This eliminates any dependency
		 * on the memory allocated for the message buffer
		 * itself.
		 */
		if (rsrc[idx].offset != 0) {
			err_str = (char *)rsrc + rsrc[idx].offset;
			err_len = strlen(err_str) + 1;

			res[idx].string = kmem_alloc(err_len, KM_SLEEP);
			bcopy(err_str, res[idx].string, err_len);
		}
	}

	return (res);
}

static void
dr_mem_res_array_fini(dr_mem_res_t *res, int nres)
{
	int	idx;
	size_t	str_len;

	for (idx = 0; idx < nres; idx++) {
		/* deallocate the error string if present */
		if (res[idx].string) {
			str_len = strlen(res[idx].string) + 1;
			kmem_free(res[idx].string, str_len);
		}
	}

	/* deallocate the result array itself */
	kmem_free(res, sizeof (dr_mem_res_t) * nres);
}

/*
 * Allocate and pack a response message for transmission based
 * on the specified result array. A valid response message and
 * valid size information is always returned.
 */
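/*
 * The packed message built below has this layout (derived from the
 * offset arithmetic in the function, shown here for reference):
 *
 *	+--------------------+  offset 0
 *	| dr_mem_hdr_t       |
 *	+--------------------+  sizeof (dr_mem_hdr_t)
 *	| dr_mem_stat_t[n]   |  n == req->msg_arg
 *	+--------------------+  sizeof (dr_mem_hdr_t) + n * stat size
 *	| error strings      |  NUL-terminated, located via string_off
 *	+--------------------+  resp_len
 */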
static size_t
dr_mem_pack_response(dr_mem_hdr_t *req, dr_mem_res_t *res, dr_mem_hdr_t **respp)
{
	int		idx;
	dr_mem_hdr_t	*resp;
	dr_mem_stat_t	*resp_stat;
	size_t		resp_len;
	uint32_t	curr_off;
	caddr_t		curr_str;
	size_t		str_len;
	size_t		stat_len;
	int		nstat = req->msg_arg;

	/*
	 * Calculate the size of the response message
	 * and allocate an appropriately sized buffer.
	 */
	resp_len = sizeof (dr_mem_hdr_t);

	/* add the stat array size */
	stat_len = sizeof (dr_mem_stat_t) * nstat;
	resp_len += stat_len;

	/* add the size of any error strings */
	for (idx = 0; idx < nstat; idx++) {
		if (res[idx].string != NULL) {
			resp_len += strlen(res[idx].string) + 1;
		}
	}

	/* allocate the message buffer */
	resp = kmem_zalloc(resp_len, KM_SLEEP);

	/*
	 * Fill in the header information.
	 */
	resp->req_num = req->req_num;
	resp->msg_type = DR_MEM_OK;
	resp->msg_arg = nstat;

	/*
	 * Fill in the stat information.
	 */
	resp_stat = DR_MEM_RESP_STATS(resp);

	/* string offsets start immediately after stat array */
	curr_off = sizeof (dr_mem_hdr_t) + stat_len;
	curr_str = (char *)resp_stat + stat_len;

	for (idx = 0; idx < nstat; idx++) {
		resp_stat[idx].addr = res[idx].addr;
		resp_stat[idx].size = res[idx].size;
		resp_stat[idx].result = res[idx].result;
		resp_stat[idx].status = res[idx].status;

		if (res[idx].string != NULL) {
			/* copy over the error string */
			str_len = strlen(res[idx].string) + 1;
			bcopy(res[idx].string, curr_str, str_len);
			resp_stat[idx].string_off = curr_off;

			curr_off += str_len;
			curr_str += str_len;
		}
	}

	/* buffer should be exactly filled */
	ASSERT(curr_off == resp_len);

	*respp = resp;
	return (resp_len);
}

static void
dr_mem_query(dr_mem_blk_t *mbp, dr_mem_query_t *mqp)
{
	memquery_t mq;

	DR_DBG_MEM("dr_mem_query...\n");

	(void) kphysm_del_span_query(btop(mbp->addr), btop(mbp->size), &mq);

	if (!mq.phys_pages)
		return;

	mqp->addr = mbp->addr;
	mqp->mq.phys_pages = ptob(mq.phys_pages);
	mqp->mq.managed = ptob(mq.managed);
	mqp->mq.nonrelocatable = ptob(mq.nonrelocatable);
	mqp->mq.first_nonrelocatable = ptob(mq.first_nonrelocatable);
	mqp->mq.last_nonrelocatable = ptob(mq.last_nonrelocatable);
	/*
	 * Set to the max byte offset within the page.
	 */
	if (mqp->mq.nonrelocatable)
		mqp->mq.last_nonrelocatable += PAGESIZE - 1;
}
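
/*
 * Worked example of the unit conversion above (hypothetical numbers,
 * assuming 8K pages): kphysm_del_span_query() reports page counts and
 * PFNs, which are scaled to bytes with ptob(). If last_nonrelocatable
 * is PFN 0x7ffff, the response carries ptob(0x7ffff) + (PAGESIZE - 1),
 * i.e. the address of the last byte of that page rather than its
 * first, making the reported span end inclusive.
 */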

/*
 * Do not modify result buffer or length on error.
 */
static int
dr_mem_list_query(dr_mem_hdr_t *req, dr_mem_hdr_t **resp, int *resp_len)
{
	int		idx;
	int		rlen;
	int		nml;
	struct memlist	*ml;
	struct memlist	*phys_copy = NULL;
	dr_mem_blk_t	*req_mblks, mb;
	dr_mem_hdr_t	*rp;
	dr_mem_query_t	*stat;

	drctl_block();

	/* the incoming array of req_mblks to configure */
	req_mblks = DR_MEM_CMD_MBLKS(req);

	/* allocate a response message, should be freed by caller */
	nml = 0;
	rlen = sizeof (dr_mem_hdr_t);
	if (req_mblks->addr == 0 && req_mblks->size == 0) {
		/*
		 * Request is for the domain's full view of its memory.
		 * Place a copy in phys_copy, then release the memlist lock.
		 */
		memlist_read_lock();
		phys_copy = dr_memlist_dup(phys_install);
		memlist_read_unlock();

		for (ml = phys_copy; ml; ml = ml->ml_next)
			nml++;

		rlen += nml * sizeof (dr_mem_query_t);
	} else {
		rlen += req->msg_arg * sizeof (dr_mem_query_t);
	}
	rp = kmem_zalloc(rlen, KM_SLEEP);

	/* fill in the known data */
	rp->req_num = req->req_num;
	rp->msg_type = DR_MEM_OK;
	rp->msg_arg = nml ? nml : req->msg_arg;

	/* stat array for the response */
	stat = DR_MEM_RESP_QUERY(rp);

	/* get the status for each of the mblocks */
	if (nml) {
		for (idx = 0, ml = phys_copy; ml; ml = ml->ml_next, idx++) {
			mb.addr = ml->ml_address;
			mb.size = ml->ml_size;
			dr_mem_query(&mb, &stat[idx]);
		}
	} else {
		for (idx = 0; idx < req->msg_arg; idx++)
			dr_mem_query(&req_mblks[idx], &stat[idx]);
	}

	*resp = rp;
	*resp_len = rlen;
	if (phys_copy != NULL) {
		dr_memlist_delete(phys_copy);
	}
	drctl_unblock();

	return (0);
}

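/*
 * Map a KPHYSM_* error code from the kphysm layer onto the
 * DR_MEM_RES_* code reported back to the domain manager. Codes with
 * no more specific mapping collapse to DR_MEM_RES_FAILURE.
 */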
static int
cvt_err(int err)
{
	int rv;

	switch (err) {
	case KPHYSM_OK:
		rv = DR_MEM_RES_OK;
		break;
	case KPHYSM_ESPAN:
		rv = DR_MEM_RES_ESPAN;
		break;
	case KPHYSM_EFAULT:
		rv = DR_MEM_RES_EFAULT;
		break;
	case KPHYSM_ERESOURCE:
		rv = DR_MEM_RES_ERESOURCE;
		break;
	case KPHYSM_ENOTSUP:
	case KPHYSM_ENOHANDLES:
		rv = DR_MEM_RES_FAILURE;
		break;
	case KPHYSM_ENONRELOC:
		rv = DR_MEM_RES_PERM;
		break;
	case KPHYSM_EHANDLE:
		rv = DR_MEM_RES_FAILURE;
		break;
	case KPHYSM_EBUSY:
		rv = DR_MEM_RES_EBUSY;
		break;
	case KPHYSM_ENOTVIABLE:
		rv = DR_MEM_RES_ENOTVIABLE;
		break;
	case KPHYSM_ESEQUENCE:
		rv = DR_MEM_RES_FAILURE;
		break;
	case KPHYSM_ENOWORK:
		rv = DR_MEM_RES_ENOWORK;
		break;
	case KPHYSM_ECANCELLED:
		rv = DR_MEM_RES_ECANCELLED;
		break;
	case KPHYSM_EREFUSED:
		rv = DR_MEM_RES_EREFUSED;
		break;
	case KPHYSM_ENOTFINISHED:
	case KPHYSM_ENOTRUNNING:
		rv = DR_MEM_RES_FAILURE;
		break;
	case KPHYSM_EDUP:
		rv = DR_MEM_RES_EDUP;
		break;
	default:
		rv = DR_MEM_RES_FAILURE;
		break;
	}

	return (rv);
}

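/*
 * Configure (add) the given mblk into the domain: validate its
 * alignment, confirm it exists in the machine description, then add
 * its pages via mem_add(). Returns a DR_MEM_RES_* code and sets
 * *status to the resulting DR_MEM_STAT_* state.
 */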
static int
dr_mem_configure(dr_mem_blk_t *mbp, int *status)
{
	int rv;
	uint64_t addr, size;

	rv = 0;
	addr = mbp->addr;
	size = mbp->size;

	DR_DBG_MEM("dr_mem_configure...\n");

	if (!MBLK_IS_VALID(mbp)) {
		DR_DBG_MEM("invalid mblk 0x%lx.0x%lx\n", addr, size);
		*status = DR_MEM_STAT_UNCONFIGURED;
		rv = DR_MEM_RES_EINVAL;
	} else if ((rv = dr_mem_find(mbp)) != 0) {
		DR_DBG_MEM("failed to find mblk 0x%lx.0x%lx (%d)\n",
		    addr, size, rv);
		if (rv == EINVAL) {
			*status = DR_MEM_STAT_NOT_PRESENT;
			rv = DR_MEM_RES_NOT_IN_MD;
		} else {
			*status = DR_MEM_STAT_UNCONFIGURED;
			rv = DR_MEM_RES_FAILURE;
		}
	} else {
		rv = mem_add(btop(addr), btop(size));
		DR_DBG_MEM("addr=0x%lx size=0x%lx rv=%d\n", addr, size, rv);
		if (rv) {
			*status = DR_MEM_STAT_UNCONFIGURED;
		} else {
			*status = DR_MEM_STAT_CONFIGURED;
		}
	}

	return (rv);
}

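/*
 * Unconfigure (delete) the given mblk from the domain via mem_del().
 * Returns a DR_MEM_RES_* code and sets *status to the resulting
 * DR_MEM_STAT_* state.
 */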
static int
dr_mem_unconfigure(dr_mem_blk_t *mbp, int *status)
{
	int rv;

	DR_DBG_MEM("dr_mem_unconfigure...\n");

	if (!MBLK_IS_VALID(mbp)) {
		DR_DBG_MEM("invalid mblk 0x%lx.0x%lx\n",
		    mbp->addr, mbp->size);
		*status = DR_MEM_STAT_CONFIGURED;
		rv = DR_MEM_RES_EINVAL;
	} else if ((rv = mem_del(btop(mbp->addr), btop(mbp->size))) != 0) {
		*status = DR_MEM_STAT_CONFIGURED;
	} else {
		*status = DR_MEM_STAT_UNCONFIGURED;
		rv = DR_MEM_RES_OK;
		DR_DBG_MEM("mblk 0x%lx.0x%lx unconfigured\n",
		    mbp->addr, mbp->size);
	}
	return (rv);
}

static int
dr_mem_del_stat(dr_mem_hdr_t *req, dr_mem_hdr_t **resp, int *resp_len)
{
	int			status;
	int			rlen;
	memdelstat_t		del_stat, *stat;
	dr_mem_hdr_t		*rp;

	/*
	 * If a mem delete is in progress, get its status.
	 */
	status = (dr_mh && (kphysm_del_status(dr_mh, &del_stat) == KPHYSM_OK));

	/* allocate a response message, should be freed by caller */
	rlen = sizeof (dr_mem_hdr_t);
	rlen += status * sizeof (memdelstat_t);
	rp = kmem_zalloc(rlen, KM_SLEEP);

	/* fill in the known data */
	rp->req_num = req->req_num;
	rp->msg_type = DR_MEM_OK;
	rp->msg_arg = status;

	if (status) {
		/* stat struct for the response */
		stat = DR_MEM_RESP_DEL_STAT(rp);
		stat->phys_pages = ptob(del_stat.phys_pages);
		stat->managed = ptob(del_stat.managed);
		stat->collected = ptob(del_stat.collected);
	}

	*resp = rp;
	*resp_len = rlen;

	return (0);
}

static int
dr_mem_del_cancel(dr_mem_hdr_t *req, dr_mem_hdr_t **resp, int *resp_len)
{
	int		rlen;
	dr_mem_hdr_t	*rp;

	/* allocate a response message, should be freed by caller */
	rlen = sizeof (dr_mem_hdr_t);
	rp = kmem_zalloc(rlen, KM_SLEEP);

	/* fill in the known data */
	rp->req_num = req->req_num;
	rp->msg_type = DR_MEM_OK;
	rp->msg_arg = (dr_mh && kphysm_del_cancel(dr_mh) != KPHYSM_OK) ?
	    DR_MEM_RES_EINVAL : DR_MEM_RES_OK;

	*resp = rp;
	*resp_len = rlen;

	return (0);
}

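/*
 * Check that the given mblk is present in the machine description.
 * Returns 0 if it is found, EINVAL if it is not, and -1 if the MD
 * cannot be accessed.
 */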
static int
dr_mem_find(dr_mem_blk_t *mbp)
{
	md_t		*mdp = NULL;
	int		num_nodes;
	int		rv = 0;
	int		listsz;
	mde_cookie_t	*listp = NULL;
	mde_cookie_t	memnode;
	char		*found = "found";

	if ((mdp = md_get_handle()) == NULL) {
		DR_DBG_MEM("unable to initialize machine description\n");
		return (-1);
	}

	num_nodes = md_node_count(mdp);
	ASSERT(num_nodes > 0);

	listsz = num_nodes * sizeof (mde_cookie_t);
	listp = kmem_zalloc(listsz, KM_SLEEP);

	memnode = dr_mem_find_node_md(mbp, mdp, listp);

	if (memnode == MDE_INVAL_ELEM_COOKIE) {
		rv = EINVAL;
		found = "not found";
	}

	DR_DBG_MEM("mblk 0x%lx.0x%lx %s\n", mbp->addr, mbp->size, found);

	kmem_free(listp, listsz);
	(void) md_fini_handle(mdp);

	return (rv);
}

/*
 * Look up a particular mblk in the MD. Returns the mde_cookie_t
 * representing that mblk if present, and MDE_INVAL_ELEM_COOKIE
 * otherwise. It is assumed the scratch array has already been
 * allocated so that it can accommodate the worst-case scenario:
 * every node in the MD.
 */
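/*
 * For illustration, a hypothetical MD "mblock" node (values invented
 * for this example):
 *
 *	mblock: base = 0x80000000, size = 0x40000000
 *
 * A request for 0x90000000.0x10000000 matches this node because the
 * containment test below holds:
 *
 *	base <= addr && (base + size) >= (addr + request size)
 */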
static mde_cookie_t
dr_mem_find_node_md(dr_mem_blk_t *mbp, md_t *mdp, mde_cookie_t *listp)
{
	int		idx;
	int		nnodes;
	mde_cookie_t	rootnode;
	uint64_t	base_prop;
	uint64_t	size_prop;
	mde_cookie_t	result = MDE_INVAL_ELEM_COOKIE;

	rootnode = md_root_node(mdp);
	ASSERT(rootnode != MDE_INVAL_ELEM_COOKIE);

	/*
	 * Scan the DAG for all the mem nodes
	 */
	nnodes = md_scan_dag(mdp, rootnode, md_find_name(mdp, "mblock"),
	    md_find_name(mdp, "fwd"), listp);

	if (nnodes < 0) {
		DR_DBG_MEM("Scan for mblks failed\n");
		return (result);
	}

	DR_DBG_MEM("dr_mem_find_node_md: found %d mblks in the MD\n", nnodes);

	/*
	 * Find the mblk of interest
	 */
	for (idx = 0; idx < nnodes; idx++) {

		if (md_get_prop_val(mdp, listp[idx], "base", &base_prop)) {
			DR_DBG_MEM("Missing 'base' property for mblk node %d\n",
			    idx);
			break;
		}

		if (md_get_prop_val(mdp, listp[idx], "size", &size_prop)) {
			DR_DBG_MEM("Missing 'size' property for mblk node %d\n",
			    idx);
			break;
		}

		if (base_prop <= mbp->addr &&
		    (base_prop + size_prop) >= (mbp->addr + mbp->size)) {
			/* found a match */
			DR_DBG_MEM("dr_mem_find_node_md: found mblk "
			    "0x%lx.0x%lx in MD\n", mbp->addr, mbp->size);
			result = listp[idx];
			break;
		}
	}

	if (result == MDE_INVAL_ELEM_COOKIE) {
		DR_DBG_MEM("mblk 0x%lx.0x%lx not in MD\n",
		    mbp->addr, mbp->size);
	}

	return (result);
}

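/*
 * Add npgs pages starting at PFN base to the system via
 * kphysm_add_memory_dynamic(), then make the new range available to
 * the kernel cage. Returns a DR_MEM_RES_* code.
 */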
static int
mem_add(pfn_t base, pgcnt_t npgs)
{
	int rv, rc;

	DR_DBG_MEM("%s: begin base=0x%lx npgs=0x%lx\n", __func__, base, npgs);

	if (npgs == 0)
		return (DR_MEM_RES_OK);

	rv = kphysm_add_memory_dynamic(base, npgs);
	DR_DBG_MEM("%s: kphysm_add(0x%lx, 0x%lx) = %d\n", __func__, base,
	    npgs, rv);
	if (rv == KPHYSM_OK) {
		if ((rc = kcage_range_add(base, npgs, KCAGE_DOWN)) != 0)
			cmn_err(CE_WARN, "kcage_range_add() = %d", rc);
	}
	rv = cvt_err(rv);
	return (rv);
}

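/*
 * Completion callback passed to kphysm_del_start(). Records the
 * outcome in the caller's mem_sync_t and wakes the thread blocked
 * in mem_del().
 */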
static void
del_done(void *arg, int error)
{
	mem_sync_t *ms = arg;

	mutex_enter(&ms->lock);
	ms->error = error;
	ms->done = 1;
	cv_signal(&ms->cond);
	mutex_exit(&ms->lock);
}

static int
mem_del(pfn_t base, pgcnt_t npgs)
{
	int rv, err, del_range = 0;
	int convert = 1;
	mem_sync_t ms;
	memquery_t mq;
	memhandle_t mh;
	struct memlist *ml;
	struct memlist *d_ml = NULL;

	DR_DBG_MEM("%s: begin base=0x%lx npgs=0x%lx\n", __func__, base, npgs);

	if (npgs == 0)
		return (DR_MEM_RES_OK);

	if ((rv = kphysm_del_gethandle(&mh)) != KPHYSM_OK) {
		cmn_err(CE_WARN, "%s: del_gethandle() = %d", __func__, rv);
		rv = cvt_err(rv);
		return (rv);
	}
	if ((rv = kphysm_del_span_query(base, npgs, &mq))
	    != KPHYSM_OK) {
		cmn_err(CE_WARN, "%s: del_span_query() = %d", __func__, rv);
		goto done;
	}
	if (mq.nonrelocatable) {
		DR_DBG_MEM("%s: non-reloc pages = %ld\n",
		    __func__, mq.nonrelocatable);
		rv = KPHYSM_ENONRELOC;
		goto done;
	}
	if ((rv = kcage_range_delete(base, npgs)) != 0) {
		/* log the raw error before it is converted below */
		cmn_err(CE_WARN, "%s: del_range() = %d", __func__, rv);
		switch (rv) {
		case EBUSY:
			rv = DR_MEM_RES_ENOTVIABLE;
			break;
		default:
			rv = DR_MEM_RES_FAILURE;
			break;
		}
		convert = 0; /* conversion done */
		goto done;
	} else {
		del_range++;
	}
	if ((rv = kphysm_del_span(mh, base, npgs)) != KPHYSM_OK) {
		cmn_err(CE_WARN, "%s: del_span() = %d", __func__, rv);
		goto done;
	}
	if ((rv = memlist_add_span(ptob(base), ptob(npgs), &d_ml))
	    != MEML_SPANOP_OK) {
		/* log the raw error before it is converted below */
		cmn_err(CE_WARN, "%s: add_span() = %d", __func__, rv);
		switch (rv) {
		case MEML_SPANOP_ESPAN:
			rv = DR_MEM_RES_ESPAN;
			break;
		case MEML_SPANOP_EALLOC:
			rv = DR_MEM_RES_ERESOURCE;
			break;
		default:
			rv = DR_MEM_RES_FAILURE;
			break;
		}
		convert = 0; /* conversion done */
		goto done;
	}

	DR_DBG_MEM("%s: reserved=0x%lx\n", __func__, npgs);

	bzero((void *)&ms, sizeof (ms));

	mutex_init(&ms.lock, NULL, MUTEX_DRIVER, NULL);
	cv_init(&ms.cond, NULL, CV_DRIVER, NULL);
	mutex_enter(&ms.lock);

	if ((rv = kphysm_del_start(mh, del_done, (void *)&ms)) == KPHYSM_OK) {
		/*
		 * Since we've called drctl_config_init, we are the only
		 * DR ctl operation in progress.  Set dr_mh to the
		 * delete memhandle for use by stat and cancel.
		 */
		ASSERT(dr_mh == NULL);
		dr_mh = mh;

		/*
		 * Wait for completion or interrupt.
		 */
		while (!ms.done) {
			if (cv_wait_sig(&ms.cond, &ms.lock) == 0) {
				/*
				 * There is a pending signal.
				 */
				(void) kphysm_del_cancel(mh);
				DR_DBG_MEM("%s: cancel\n", __func__);
				/*
				 * Wait for completion.
				 */
				while (!ms.done)
					cv_wait(&ms.cond, &ms.lock);
			}
		}
		dr_mh = NULL;
		rv = ms.error;
	} else {
		DR_DBG_MEM("%s: del_start() = %d\n", __func__, rv);
	}

	mutex_exit(&ms.lock);
	cv_destroy(&ms.cond);
	mutex_destroy(&ms.lock);

done:
	if (rv && del_range) {
		/*
		 * Add back the spans to the kcage growth list.
		 */
		for (ml = d_ml; ml; ml = ml->ml_next)
			if ((err = kcage_range_add(btop(ml->ml_address),
			    btop(ml->ml_size), KCAGE_DOWN)) != 0)
				cmn_err(CE_WARN, "kcage_range_add() = %d", err);
	}
	memlist_free_list(d_ml);

	if ((err = kphysm_del_release(mh)) != KPHYSM_OK)
		cmn_err(CE_WARN, "%s: del_release() = %d", __func__, err);
	if (convert)
		rv = cvt_err(rv);

	DR_DBG_MEM("%s: rv=%d\n", __func__, rv);

	return (rv);
}