/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*
 * sun4v Memory DR Module
 */

#include <sys/types.h>
#include <sys/cmn_err.h>
#include <sys/vmem.h>
#include <sys/kmem.h>
#include <sys/systm.h>
#include <sys/machsystm.h>	/* for page_freelist_coalesce() */
#include <sys/errno.h>
#include <sys/memnode.h>
#include <sys/memlist.h>
#include <sys/memlist_impl.h>
#include <sys/tuneable.h>
#include <sys/proc.h>
#include <sys/disp.h>
#include <sys/debug.h>
#include <sys/vm.h>
#include <sys/callb.h>
#include <sys/memlist_plat.h>	/* for installed_top_size() */
#include <sys/condvar_impl.h>	/* for CV_HAS_WAITERS() */
#include <sys/dumphdr.h>	/* for dump_resize() */
#include <sys/atomic.h>		/* for use in stats collection */
#include <sys/rwlock.h>
#include <vm/seg_kmem.h>
#include <vm/seg_kpm.h>
#include <vm/page.h>
#include <vm/vm_dep.h>
#define	SUNDDI_IMPL		/* so sunddi.h will not redefine splx() et al */
#include <sys/sunddi.h>
#include <sys/mem_config.h>
#include <sys/mem_cage.h>
#include <sys/lgrp.h>
#include <sys/ddi.h>

#include <sys/modctl.h>
#include <sys/sysevent/dr.h>
#include <sys/mach_descrip.h>
#include <sys/mdesc.h>
#include <sys/ds.h>
#include <sys/drctl.h>
#include <sys/dr_util.h>
#include <sys/dr_mem.h>
#include <sys/suspend.h>

/*
 * Memory DR operations are subject to alignment restrictions
 * on both the address and the size of a request.
 */
#define	MA_ADDR	0x10000000	/* addr alignment 256M */
#define	MA_SIZE	0x10000000	/* size alignment 256M */

#define	MBLK_IS_VALID(m) \
	(IS_P2ALIGNED((m)->addr, MA_ADDR) && IS_P2ALIGNED((m)->size, MA_SIZE))

static memhandle_t dr_mh;	/* memory handle for delete */

static struct modlmisc modlmisc = {
	&mod_miscops,
	"sun4v memory DR"
};

static struct modlinkage modlinkage = {
	MODREV_1,
	(void *)&modlmisc,
	NULL
};

static int dr_mem_allow_unload = 0;

typedef int (*fn_t)(dr_mem_blk_t *, int *);

/*
 * Global Domain Services (DS) Handle
 */
static ds_svc_hdl_t ds_handle;

/*
 * Supported DS Capability Versions
 */
static ds_ver_t		dr_mem_vers[] = { { 1, 0 } };
#define	DR_MEM_NVERS	(sizeof (dr_mem_vers) / sizeof (dr_mem_vers[0]))

/*
 * DS Capability Description
 */
static ds_capability_t dr_mem_cap = {
	DR_MEM_DS_ID,		/* svc_id */
	dr_mem_vers,		/* vers */
	DR_MEM_NVERS		/* nvers */
};

/*
 * DS Callbacks
 */
static void dr_mem_reg_handler(ds_cb_arg_t, ds_ver_t *, ds_svc_hdl_t);
static void dr_mem_unreg_handler(ds_cb_arg_t arg);
static void dr_mem_data_handler(ds_cb_arg_t arg, void *buf, size_t buflen);

/*
 * DS Client Ops Vector
 */
static ds_clnt_ops_t dr_mem_ops = {
	dr_mem_reg_handler,	/* ds_reg_cb */
	dr_mem_unreg_handler,	/* ds_unreg_cb */
	dr_mem_data_handler,	/* ds_data_cb */
	NULL			/* cb_arg */
};

/*
 * Operation Results
 *
 * Used internally to gather results while an operation on a
 * list of mblks is in progress. In particular, it is used to
 * keep track of which mblks have already failed so that they are
 * not processed further, and the manner in which they failed.
 */
typedef struct {
	uint64_t	addr;
	uint64_t	size;
	uint32_t	result;
	uint32_t	status;
	char		*string;
} dr_mem_res_t;

static char *
dr_mem_estr[] = {
	"operation succeeded",		/* DR_MEM_RES_OK */
	"operation failed",		/* DR_MEM_RES_FAILURE */
	"operation was blocked",	/* DR_MEM_RES_BLOCKED */
	"memory not defined in MD",	/* DR_MEM_RES_NOT_IN_MD */
	"memory already in use",	/* DR_MEM_RES_ESPAN */
	"memory access test failed",	/* DR_MEM_RES_EFAULT */
	"resource not available",	/* DR_MEM_RES_ERESOURCE */
	"permanent pages in span",	/* DR_MEM_RES_PERM */
	"memory span busy",		/* DR_MEM_RES_EBUSY */
	"VM viability test failed",	/* DR_MEM_RES_ENOTVIABLE */
	"no pages to unconfigure",	/* DR_MEM_RES_ENOWORK */
	"operation cancelled",		/* DR_MEM_RES_ECANCELLED */
	"operation refused",		/* DR_MEM_RES_EREFUSED */
	"memory span duplicate",	/* DR_MEM_RES_EDUP */
	"invalid argument"		/* DR_MEM_RES_EINVAL */
};

static char *
dr_mem_estr_detail[] = {
	"",					/* DR_MEM_SRES_NONE */
	"memory DR disabled after migration"	/* DR_MEM_SRES_OS_SUSPENDED */
};

typedef struct {
	kcondvar_t cond;
	kmutex_t lock;
	int error;
	int done;
} mem_sync_t;

/*
 * Internal Functions
 */
static int dr_mem_init(void);
static int dr_mem_fini(void);

static int dr_mem_list_wrk(dr_mem_hdr_t *, dr_mem_hdr_t **, int *);
static int dr_mem_list_query(dr_mem_hdr_t *, dr_mem_hdr_t **, int *);
static int dr_mem_del_stat(dr_mem_hdr_t *, dr_mem_hdr_t **, int *);
static int dr_mem_del_cancel(dr_mem_hdr_t *, dr_mem_hdr_t **, int *);

static int dr_mem_unconfigure(dr_mem_blk_t *, int *);
static int dr_mem_configure(dr_mem_blk_t *, int *);
static void dr_mem_query(dr_mem_blk_t *, dr_mem_query_t *);

static dr_mem_res_t *dr_mem_res_array_init(dr_mem_hdr_t *, drctl_rsrc_t *, int);
static void dr_mem_res_array_fini(dr_mem_res_t *res, int nres);
static size_t dr_mem_pack_response(dr_mem_hdr_t *req, dr_mem_res_t *res,
    dr_mem_hdr_t **respp);

static int dr_mem_find(dr_mem_blk_t *mbp);
static mde_cookie_t dr_mem_find_node_md(dr_mem_blk_t *, md_t *, mde_cookie_t *);

static int mem_add(pfn_t, pgcnt_t);
static int mem_del(pfn_t, pgcnt_t);

extern int kphysm_add_memory_dynamic(pfn_t, pgcnt_t);

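/*
 * Loadable module entry points. The module refuses to load if
 * memory DR is disabled on this platform and, by default, refuses
 * to unload once loaded (see dr_mem_allow_unload).
 */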
int
_init(void)
{
	int	status;

	/* check that Memory DR is enabled */
	if (dr_is_disabled(DR_TYPE_MEM))
		return (ENOTSUP);

	if ((status = dr_mem_init()) != 0) {
		cmn_err(CE_NOTE, "Memory DR initialization failed");
		return (status);
	}

	if ((status = mod_install(&modlinkage)) != 0) {
		(void) dr_mem_fini();
	}

	return (status);
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}

int
_fini(void)
{
	int	status;

	if (dr_mem_allow_unload == 0)
		return (EBUSY);

	if ((status = mod_remove(&modlinkage)) == 0) {
		(void) dr_mem_fini();
	}

	return (status);
}

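/*
 * Register the dr_mem service with the Domain Services framework.
 * The DS callbacks below are invoked once capability negotiation
 * with the service peer completes.
 */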
static int
dr_mem_init(void)
{
	int rv;

	if ((rv = ds_cap_init(&dr_mem_cap, &dr_mem_ops)) != 0) {
		cmn_err(CE_NOTE, "dr_mem: ds_cap_init failed: %d", rv);
		return (rv);
	}

	return (0);
}

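/*
 * Unregister the dr_mem service from the Domain Services framework.
 */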
static int
dr_mem_fini(void)
{
	int rv;

	if ((rv = ds_cap_fini(&dr_mem_cap)) != 0) {
		cmn_err(CE_NOTE, "dr_mem: ds_cap_fini failed: %d", rv);
	}

	return (rv);
}

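/*
 * DS registration callback. Records the service handle used to
 * send responses back to the service peer.
 */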
static void
dr_mem_reg_handler(ds_cb_arg_t arg, ds_ver_t *ver, ds_svc_hdl_t hdl)
{
	DR_DBG_MEM("reg_handler: arg=0x%p, ver=%d.%d, hdl=0x%lx\n", arg,
	    ver->major, ver->minor, hdl);

	ds_handle = hdl;
}

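/*
 * DS unregistration callback. Invalidates the cached service handle.
 */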
static void
dr_mem_unreg_handler(ds_cb_arg_t arg)
{
	DR_DBG_MEM("unreg_handler: arg=0x%p\n", arg);

	ds_handle = DS_INVALID_HDL;
}

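/*
 * DS data callback. Validates an incoming memory DR request,
 * dispatches it to the appropriate handler, and always sends a
 * response: either the handler's packed response or a DR_MEM_ERROR
 * message if the request could not be processed.
 */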
/*ARGSUSED*/
static void
dr_mem_data_handler(ds_cb_arg_t arg, void *buf, size_t buflen)
{
	dr_mem_hdr_t	*req = buf;
	dr_mem_hdr_t	err_resp;
	dr_mem_hdr_t	*resp = &err_resp;
	int		resp_len = 0;
	int		rv = EINVAL;

	/*
	 * Sanity check the message
	 */
	if (buflen < sizeof (dr_mem_hdr_t)) {
		DR_DBG_MEM("incoming message short: expected at least %ld "
		    "bytes, received %ld\n", sizeof (dr_mem_hdr_t), buflen);
		goto done;
	}

	if (req == NULL) {
		DR_DBG_MEM("empty message: expected at least %ld bytes\n",
		    sizeof (dr_mem_hdr_t));
		goto done;
	}

	DR_DBG_MEM("incoming request:\n");
	DR_DBG_DUMP_MSG(buf, buflen);

	/*
	 * Process the command
	 */
	switch (req->msg_type) {
	case DR_MEM_CONFIGURE:
	case DR_MEM_UNCONFIGURE:
		if (req->msg_arg == 0) {
			DR_DBG_MEM("No mblks specified for operation\n");
			goto done;
		}
		if ((rv = dr_mem_list_wrk(req, &resp, &resp_len)) != 0) {
			DR_DBG_MEM("%s failed (%d)\n",
			    (req->msg_type == DR_MEM_CONFIGURE) ?
			    "Memory configure" : "Memory unconfigure", rv);
		}
		break;

	case DR_MEM_UNCONF_STATUS:
		if ((rv = dr_mem_del_stat(req, &resp, &resp_len)) != 0)
			DR_DBG_MEM("Memory delete status failed (%d)\n", rv);
		break;

	case DR_MEM_UNCONF_CANCEL:
		if ((rv = dr_mem_del_cancel(req, &resp, &resp_len)) != 0)
			DR_DBG_MEM("Memory delete cancel failed (%d)\n", rv);
		break;

	case DR_MEM_QUERY:
		if (req->msg_arg == 0) {
			DR_DBG_MEM("No mblks specified for operation\n");
			goto done;
		}
		if ((rv = dr_mem_list_query(req, &resp, &resp_len)) != 0)
			DR_DBG_MEM("Memory query failed (%d)\n", rv);
		break;

	default:
		cmn_err(CE_NOTE, "unsupported memory DR operation (%d)",
		    req->msg_type);
		break;
	}

done:
	/* check if an error occurred */
	if (resp == &err_resp) {
		resp->req_num = (req) ? req->req_num : 0;
		resp->msg_type = DR_MEM_ERROR;
		resp->msg_arg = rv;
		resp_len = sizeof (dr_mem_hdr_t);
	}

	DR_DBG_MEM("outgoing response:\n");
	DR_DBG_DUMP_MSG(resp, resp_len);

	/* send back the response */
	if (ds_cap_send(ds_handle, resp, resp_len) != 0) {
		DR_DBG_MEM("ds_send failed\n");
	}

	/* free any allocated memory */
	if (resp != &err_resp) {
		kmem_free(resp, resp_len);
	}
}

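/*
 * Build the error string for a result/subresult pair. The returned
 * string is always dynamically allocated and must be freed by the
 * caller.
 */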
static char *
dr_mem_get_errstr(int result, int subresult)
{
	size_t len;
	char *errstr;
	const char *separator = ": ";

	if (subresult == DR_MEM_SRES_NONE)
		return (i_ddi_strdup(dr_mem_estr[result], KM_SLEEP));

	len = snprintf(NULL, 0, "%s%s%s", dr_mem_estr[result],
	    separator, dr_mem_estr_detail[subresult]) + 1;

	errstr = kmem_alloc(len, KM_SLEEP);

	(void) snprintf(errstr, len, "%s%s%s", dr_mem_estr[result],
	    separator, dr_mem_estr_detail[subresult]);

	return (errstr);
}

/*
 * Common routine to config or unconfig multiple mblks.
 *
 * Note: Do not modify result buffer or length on error.
 */
static int
dr_mem_list_wrk(dr_mem_hdr_t *req, dr_mem_hdr_t **resp, int *resp_len)
{
	int		rv;
	int		idx;
	int		count;
	int		result;
	int		subresult;
	int		status;
	boolean_t	suspend_allows_dr;
	fn_t		dr_fn;
	int		se_hint;
	dr_mem_blk_t	*req_mblks;
	dr_mem_res_t	*res;
	int		drctl_cmd;
	int		drctl_flags = 0;
	drctl_rsrc_t	*drctl_req;
	size_t		drctl_req_len;
	drctl_resp_t	*drctl_resp;
	drctl_rsrc_t	*drctl_rsrc;
	size_t		drctl_resp_len = 0;
	drctl_cookie_t	drctl_res_ck;

	ASSERT((req != NULL) && (req->msg_arg != 0));

	count = req->msg_arg;

	/*
	 * Extract all information that is specific
	 * to the various types of operations.
	 */
	switch (req->msg_type) {
	case DR_MEM_CONFIGURE:
		dr_fn = dr_mem_configure;
		drctl_cmd = DRCTL_MEM_CONFIG_REQUEST;
		se_hint = SE_HINT_INSERT;
		break;
	case DR_MEM_UNCONFIGURE:
		dr_fn = dr_mem_unconfigure;
		drctl_cmd = DRCTL_MEM_UNCONFIG_REQUEST;
		se_hint = SE_HINT_REMOVE;
		break;
	default:
		/* Programming error if we reach this. */
		cmn_err(CE_NOTE, "%s: bad msg_type %d\n",
		    __func__, req->msg_type);
		ASSERT(0);
		return (-1);
	}

	/* the incoming array of mblks to operate on */
	req_mblks = DR_MEM_CMD_MBLKS(req);

	/* allocate drctl request msg based on incoming resource count */
	drctl_req_len = sizeof (drctl_rsrc_t) * count;
	drctl_req = kmem_zalloc(drctl_req_len, KM_SLEEP);

	/* copy the size for the drctl call from the incoming request msg */
	for (idx = 0; idx < count; idx++) {
		drctl_req[idx].res_mem_addr = req_mblks[idx].addr;
		drctl_req[idx].res_mem_size = req_mblks[idx].size;
	}

	rv = drctl_config_init(drctl_cmd, drctl_flags, drctl_req,
	    count, &drctl_resp, &drctl_resp_len, &drctl_res_ck);

	ASSERT((drctl_resp != NULL) && (drctl_resp_len != 0));

	if (rv != 0) {
		DR_DBG_MEM("%s: drctl_config_init returned: %d\n",
		    __func__, rv);
		kmem_free(drctl_resp, drctl_resp_len);
		kmem_free(drctl_req, drctl_req_len);
		return (rv);
	}

	ASSERT(drctl_resp->resp_type == DRCTL_RESP_OK);

	drctl_rsrc = drctl_resp->resp_resources;

	/* create the result scratch array */
	res = dr_mem_res_array_init(req, drctl_rsrc, count);

	/*
	 * Memory DR operations are not safe if we have been suspended and
	 * resumed. Until this limitation is lifted, check to see if memory
	 * DR operations are permitted at this time by the suspend subsystem.
	 */
	if ((suspend_allows_dr = suspend_memdr_allowed()) == B_FALSE) {
		result = DR_MEM_RES_BLOCKED;
		subresult = DR_MEM_SRES_OS_SUSPENDED;
		status = (req->msg_type == DR_MEM_CONFIGURE) ?
		    DR_MEM_STAT_UNCONFIGURED : DR_MEM_STAT_CONFIGURED;
	} else {
		subresult = DR_MEM_SRES_NONE;
	}

	/* perform the specified operation on each of the mblks */
	for (idx = 0; idx < count; idx++) {
		/*
		 * If no action will be taken against the current
		 * mblk, update the drctl resource information to
		 * ensure that it gets recovered properly during
		 * the drctl fini() call.
		 */
		if (res[idx].result != DR_MEM_RES_OK) {
			drctl_req[idx].status = DRCTL_STATUS_CONFIG_FAILURE;
			continue;
		}

		/*
		 * If memory DR operations are permitted at this time by
		 * the suspend subsystem, call the function to perform the
		 * operation, otherwise return a result indicating that the
		 * operation was blocked.
		 */
		if (suspend_allows_dr)
			result = (*dr_fn)(&req_mblks[idx], &status);

		/* save off results of the operation */
		res[idx].result = result;
		res[idx].status = status;
		res[idx].addr = req_mblks[idx].addr;	/* for partial case */
		res[idx].size = req_mblks[idx].size;	/* for partial case */
		res[idx].string = dr_mem_get_errstr(result, subresult);

		/* save result for drctl fini() reusing init() msg memory */
		drctl_req[idx].status = (result != DR_MEM_RES_OK) ?
		    DRCTL_STATUS_CONFIG_FAILURE : DRCTL_STATUS_CONFIG_SUCCESS;

		DR_DBG_MEM("%s: mblk 0x%lx.0x%lx stat %d result %d off '%s'\n",
		    __func__, req_mblks[idx].addr, req_mblks[idx].size,
		    drctl_req[idx].status, result,
		    (res[idx].string) ? res[idx].string : "");
	}

	if ((rv = drctl_config_fini(&drctl_res_ck, drctl_req, count)) != 0)
		DR_DBG_MEM("%s: drctl_config_fini returned: %d\n",
		    __func__, rv);

	/*
	 * Operation completed without any fatal errors.
	 * Pack the response for transmission.
	 */
	*resp_len = dr_mem_pack_response(req, res, resp);

	/* notify interested parties about the operation */
	dr_generate_event(DR_TYPE_MEM, se_hint);

	/*
	 * Deallocate any scratch memory.
	 */
	kmem_free(drctl_resp, drctl_resp_len);
	kmem_free(drctl_req, drctl_req_len);

	dr_mem_res_array_fini(res, count);

	return (0);
}

/*
 * Allocate and initialize a result array based on the initial
 * drctl operation. A valid result array is always returned.
 */
static dr_mem_res_t *
dr_mem_res_array_init(dr_mem_hdr_t *req, drctl_rsrc_t *rsrc, int nrsrc)
{
	int		idx;
	dr_mem_res_t	*res;
	char		*err_str;
	size_t		err_len;

	/* allocate zero filled buffer to initialize fields */
	res = kmem_zalloc(nrsrc * sizeof (dr_mem_res_t), KM_SLEEP);

	/*
	 * Fill in the result information for each resource.
	 */
	for (idx = 0; idx < nrsrc; idx++) {
		res[idx].addr = rsrc[idx].res_mem_addr;
		res[idx].size = rsrc[idx].res_mem_size;
		res[idx].result = DR_MEM_RES_OK;

		if (rsrc[idx].status == DRCTL_STATUS_ALLOW)
			continue;

		/*
		 * Update the state information for this mblk.
		 */
		res[idx].result = DR_MEM_RES_BLOCKED;
		res[idx].status = (req->msg_type == DR_MEM_CONFIGURE) ?
		    DR_MEM_STAT_UNCONFIGURED : DR_MEM_STAT_CONFIGURED;

		/*
		 * If an error string exists, copy it out of the
		 * message buffer. This eliminates any dependency
		 * on the memory allocated for the message buffer
		 * itself.
		 */
		if (rsrc[idx].offset != 0) {
			err_str = (char *)rsrc + rsrc[idx].offset;
			err_len = strlen(err_str) + 1;

			res[idx].string = kmem_alloc(err_len, KM_SLEEP);
			bcopy(err_str, res[idx].string, err_len);
		}
	}

	return (res);
}

static void
dr_mem_res_array_fini(dr_mem_res_t *res, int nres)
{
	int	idx;
	size_t	str_len;

	for (idx = 0; idx < nres; idx++) {
		/* deallocate the error string if present */
		if (res[idx].string) {
			str_len = strlen(res[idx].string) + 1;
			kmem_free(res[idx].string, str_len);
		}
	}

	/* deallocate the result array itself */
	kmem_free(res, sizeof (dr_mem_res_t) * nres);
}

/*
 * Allocate and pack a response message for transmission based
 * on the specified result array. A valid response message and
 * valid size information is always returned.
 */
static size_t
dr_mem_pack_response(dr_mem_hdr_t *req, dr_mem_res_t *res, dr_mem_hdr_t **respp)
{
	int		idx;
	dr_mem_hdr_t	*resp;
	dr_mem_stat_t	*resp_stat;
	size_t		resp_len;
	uint32_t	curr_off;
	caddr_t		curr_str;
	size_t		str_len;
	size_t		stat_len;
	int		nstat = req->msg_arg;

	/*
	 * Calculate the size of the response message
	 * and allocate an appropriately sized buffer.
	 */
	resp_len = sizeof (dr_mem_hdr_t);

	/* add the stat array size */
	stat_len = sizeof (dr_mem_stat_t) * nstat;
	resp_len += stat_len;

	/* add the size of any error strings */
	for (idx = 0; idx < nstat; idx++) {
		if (res[idx].string != NULL) {
			resp_len += strlen(res[idx].string) + 1;
		}
	}

	/* allocate the message buffer */
	resp = kmem_zalloc(resp_len, KM_SLEEP);

	/*
	 * Fill in the header information.
	 */
	resp->req_num = req->req_num;
	resp->msg_type = DR_MEM_OK;
	resp->msg_arg = nstat;

	/*
	 * Fill in the stat information.
	 */
	resp_stat = DR_MEM_RESP_STATS(resp);

	/* string offsets start immediately after stat array */
	curr_off = sizeof (dr_mem_hdr_t) + stat_len;
	curr_str = (char *)resp_stat + stat_len;

	for (idx = 0; idx < nstat; idx++) {
		resp_stat[idx].addr = res[idx].addr;
		resp_stat[idx].size = res[idx].size;
		resp_stat[idx].result = res[idx].result;
		resp_stat[idx].status = res[idx].status;

		if (res[idx].string != NULL) {
			/* copy over the error string */
			str_len = strlen(res[idx].string) + 1;
			bcopy(res[idx].string, curr_str, str_len);
			resp_stat[idx].string_off = curr_off;

			curr_off += str_len;
			curr_str += str_len;
		}
	}

	/* buffer should be exactly filled */
	ASSERT(curr_off == resp_len);

	*respp = resp;
	return (resp_len);
}

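/*
 * Collect memory delete statistics for a single mblk. All page
 * counts are converted to byte counts in the response.
 */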
static void
dr_mem_query(dr_mem_blk_t *mbp, dr_mem_query_t *mqp)
{
	memquery_t mq;

	DR_DBG_MEM("dr_mem_query...\n");

	(void) kphysm_del_span_query(btop(mbp->addr), btop(mbp->size), &mq);

	if (!mq.phys_pages)
		return;

	mqp->addr = mbp->addr;
	mqp->mq.phys_pages = ptob(mq.phys_pages);
	mqp->mq.managed = ptob(mq.managed);
	mqp->mq.nonrelocatable = ptob(mq.nonrelocatable);
	mqp->mq.first_nonrelocatable = ptob(mq.first_nonrelocatable);
	mqp->mq.last_nonrelocatable = ptob(mq.last_nonrelocatable);
	/*
	 * Set to the max byte offset within the page.
	 */
	if (mqp->mq.nonrelocatable)
		mqp->mq.last_nonrelocatable += PAGESIZE - 1;
}

/*
 * Do not modify result buffer or length on error.
 */
static int
dr_mem_list_query(dr_mem_hdr_t *req, dr_mem_hdr_t **resp, int *resp_len)
{
	int		idx;
	int		rlen;
	int		nml;
	struct memlist	*ml;
	struct memlist	*phys_copy = NULL;
	dr_mem_blk_t	*req_mblks, mb;
	dr_mem_hdr_t	*rp;
	dr_mem_query_t	*stat;

	drctl_block();

	/* the incoming array of req_mblks to configure */
	req_mblks = DR_MEM_CMD_MBLKS(req);

	/* allocate a response message, should be freed by caller */
	nml = 0;
	rlen = sizeof (dr_mem_hdr_t);
	if (req_mblks->addr == 0 && req_mblks->size == 0) {
		/*
		 * The request is for the domain's full view of its memory.
		 * Place a copy in phys_copy, then release the memlist lock.
		 */
		memlist_read_lock();
		phys_copy = dr_memlist_dup(phys_install);
		memlist_read_unlock();

		for (ml = phys_copy; ml; ml = ml->ml_next)
			nml++;

		rlen += nml * sizeof (dr_mem_query_t);
	} else {
		rlen += req->msg_arg * sizeof (dr_mem_query_t);
	}
	rp = kmem_zalloc(rlen, KM_SLEEP);

	/* fill in the known data */
	rp->req_num = req->req_num;
	rp->msg_type = DR_MEM_OK;
	rp->msg_arg = nml ? nml : req->msg_arg;

	/* stat array for the response */
	stat = DR_MEM_RESP_QUERY(rp);

	/* get the status for each of the mblocks */
	if (nml) {
		for (idx = 0, ml = phys_copy; ml; ml = ml->ml_next, idx++) {
			mb.addr = ml->ml_address;
			mb.size = ml->ml_size;
			dr_mem_query(&mb, &stat[idx]);
		}
	} else {
		for (idx = 0; idx < req->msg_arg; idx++)
			dr_mem_query(&req_mblks[idx], &stat[idx]);
	}

	*resp = rp;
	*resp_len = rlen;
	if (phys_copy != NULL) {
		dr_memlist_delete(phys_copy);
	}
	drctl_unblock();

	return (0);
}

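/*
 * Map a kphysm_* error code to its DR_MEM_RES_* equivalent.
 */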
static int
cvt_err(int err)
{
	int rv;

	switch (err) {
	case KPHYSM_OK:
		rv = DR_MEM_RES_OK;
		break;
	case KPHYSM_ESPAN:
		rv = DR_MEM_RES_ESPAN;
		break;
	case KPHYSM_EFAULT:
		rv = DR_MEM_RES_EFAULT;
		break;
	case KPHYSM_ERESOURCE:
		rv = DR_MEM_RES_ERESOURCE;
		break;
	case KPHYSM_ENOTSUP:
	case KPHYSM_ENOHANDLES:
		rv = DR_MEM_RES_FAILURE;
		break;
	case KPHYSM_ENONRELOC:
		rv = DR_MEM_RES_PERM;
		break;
	case KPHYSM_EHANDLE:
		rv = DR_MEM_RES_FAILURE;
		break;
	case KPHYSM_EBUSY:
		rv = DR_MEM_RES_EBUSY;
		break;
	case KPHYSM_ENOTVIABLE:
		rv = DR_MEM_RES_ENOTVIABLE;
		break;
	case KPHYSM_ESEQUENCE:
		rv = DR_MEM_RES_FAILURE;
		break;
	case KPHYSM_ENOWORK:
		rv = DR_MEM_RES_ENOWORK;
		break;
	case KPHYSM_ECANCELLED:
		rv = DR_MEM_RES_ECANCELLED;
		break;
	case KPHYSM_EREFUSED:
		rv = DR_MEM_RES_EREFUSED;
		break;
	case KPHYSM_ENOTFINISHED:
	case KPHYSM_ENOTRUNNING:
		rv = DR_MEM_RES_FAILURE;
		break;
	case KPHYSM_EDUP:
		rv = DR_MEM_RES_EDUP;
		break;
	default:
		rv = DR_MEM_RES_FAILURE;
		break;
	}

	return (rv);
}

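/*
 * Configure a single mblk: validate its alignment, confirm that it
 * is present in the MD, then add the span to the system. Returns a
 * DR_MEM_RES_* value and sets *status to the resulting DR state.
 */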
static int
dr_mem_configure(dr_mem_blk_t *mbp, int *status)
{
	int rv;
	uint64_t addr, size;

	rv = 0;
	addr = mbp->addr;
	size = mbp->size;

	DR_DBG_MEM("dr_mem_configure...\n");

	if (!MBLK_IS_VALID(mbp)) {
		DR_DBG_MEM("invalid mblk 0x%lx.0x%lx\n", addr, size);
		*status = DR_MEM_STAT_UNCONFIGURED;
		rv = DR_MEM_RES_EINVAL;
	} else if ((rv = dr_mem_find(mbp)) != 0) {
		DR_DBG_MEM("failed to find mblk 0x%lx.0x%lx (%d)\n",
		    addr, size, rv);
		if (rv == EINVAL) {
			*status = DR_MEM_STAT_NOT_PRESENT;
			rv = DR_MEM_RES_NOT_IN_MD;
		} else {
			*status = DR_MEM_STAT_UNCONFIGURED;
			rv = DR_MEM_RES_FAILURE;
		}
	} else {
		rv = mem_add(btop(addr), btop(size));
		DR_DBG_MEM("addr=0x%lx size=0x%lx rv=%d\n", addr, size, rv);
		if (rv) {
			*status = DR_MEM_STAT_UNCONFIGURED;
		} else {
			*status = DR_MEM_STAT_CONFIGURED;
		}
	}

	return (rv);
}

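/*
 * Unconfigure a single mblk by deleting its span from the system.
 * Returns a DR_MEM_RES_* value and sets *status to the resulting
 * DR state.
 */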
static int
dr_mem_unconfigure(dr_mem_blk_t *mbp, int *status)
{
	int rv;

	DR_DBG_MEM("dr_mem_unconfigure...\n");

	if (!MBLK_IS_VALID(mbp)) {
		DR_DBG_MEM("invalid mblk 0x%lx.0x%lx\n",
		    mbp->addr, mbp->size);
		*status = DR_MEM_STAT_CONFIGURED;
		rv = DR_MEM_RES_EINVAL;
	} else if ((rv = mem_del(btop(mbp->addr), btop(mbp->size))) != 0) {
		*status = DR_MEM_STAT_CONFIGURED;
	} else {
		*status = DR_MEM_STAT_UNCONFIGURED;
		rv = DR_MEM_RES_OK;
		DR_DBG_MEM("mblk 0x%lx.0x%lx unconfigured\n",
		    mbp->addr, mbp->size);
	}
	return (rv);
}

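/*
 * Report the progress of an in-flight memory delete, if any.
 */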
static int
dr_mem_del_stat(dr_mem_hdr_t *req, dr_mem_hdr_t **resp, int *resp_len)
{
	int			status;
	int			rlen;
	memdelstat_t		del_stat, *stat;
	dr_mem_hdr_t		*rp;

	/*
	 * If a mem delete is in progress, get its status.
	 */
	status = (dr_mh && (kphysm_del_status(dr_mh, &del_stat) == KPHYSM_OK));

	/* allocate a response message, should be freed by caller */
	rlen = sizeof (dr_mem_hdr_t);
	rlen += status * sizeof (memdelstat_t);
	rp = kmem_zalloc(rlen, KM_SLEEP);

	/* fill in the known data */
	rp->req_num = req->req_num;
	rp->msg_type = DR_MEM_OK;
	rp->msg_arg = status;

	if (status) {
		/* stat struct for the response */
		stat = DR_MEM_RESP_DEL_STAT(rp);
		stat->phys_pages = ptob(del_stat.phys_pages);
		stat->managed = ptob(del_stat.managed);
		stat->collected = ptob(del_stat.collected);
	}

	*resp = rp;
	*resp_len = rlen;

	return (0);
}

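/*
 * Cancel an in-flight memory delete, if any.
 */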
static int
dr_mem_del_cancel(dr_mem_hdr_t *req, dr_mem_hdr_t **resp, int *resp_len)
{
	int		rlen;
	dr_mem_hdr_t	*rp;

	/* allocate a response message, should be freed by caller */
	rlen = sizeof (dr_mem_hdr_t);
	rp = kmem_zalloc(rlen, KM_SLEEP);

	/* fill in the known data */
	rp->req_num = req->req_num;
	rp->msg_type = DR_MEM_OK;
	rp->msg_arg = (dr_mh && kphysm_del_cancel(dr_mh) != KPHYSM_OK) ?
	    DR_MEM_RES_EINVAL : DR_MEM_RES_OK;

	*resp = rp;
	*resp_len = rlen;

	return (0);
}

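/*
 * Check that an mblk is contained within one of the mblock nodes
 * of the machine description. Returns 0 if so, EINVAL if not, and
 * -1 if the MD could not be accessed.
 */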
static int
dr_mem_find(dr_mem_blk_t *mbp)
{
	md_t		*mdp = NULL;
	int		num_nodes;
	int		rv = 0;
	int		listsz;
	mde_cookie_t	*listp = NULL;
	mde_cookie_t	memnode;
	char		*found = "found";

	if ((mdp = md_get_handle()) == NULL) {
		DR_DBG_MEM("unable to initialize machine description\n");
		return (-1);
	}

	num_nodes = md_node_count(mdp);
	ASSERT(num_nodes > 0);

	listsz = num_nodes * sizeof (mde_cookie_t);
	listp = kmem_zalloc(listsz, KM_SLEEP);

	memnode = dr_mem_find_node_md(mbp, mdp, listp);

	if (memnode == MDE_INVAL_ELEM_COOKIE) {
		rv = EINVAL;
		found = "not found";
	}

	DR_DBG_MEM("mblk 0x%lx.0x%lx %s\n", mbp->addr, mbp->size, found);

	kmem_free(listp, listsz);
	(void) md_fini_handle(mdp);

	return (rv);
}

/*
 * Look up a particular mblk in the MD. Returns the mde_cookie_t
 * representing that mblk if present, and MDE_INVAL_ELEM_COOKIE
 * otherwise. It is assumed the scratch array has already been
 * allocated so that it can accommodate the worst case scenario,
 * every node in the MD.
 */
static mde_cookie_t
dr_mem_find_node_md(dr_mem_blk_t *mbp, md_t *mdp, mde_cookie_t *listp)
{
	int		idx;
	int		nnodes;
	mde_cookie_t	rootnode;
	uint64_t	base_prop;
	uint64_t	size_prop;
	mde_cookie_t	result = MDE_INVAL_ELEM_COOKIE;

	rootnode = md_root_node(mdp);
	ASSERT(rootnode != MDE_INVAL_ELEM_COOKIE);

	/*
	 * Scan the DAG for all the mem nodes
	 */
	nnodes = md_scan_dag(mdp, rootnode, md_find_name(mdp, "mblock"),
	    md_find_name(mdp, "fwd"), listp);

	if (nnodes < 0) {
		DR_DBG_MEM("Scan for mblks failed\n");
		return (result);
	}

	DR_DBG_MEM("dr_mem_find_node_md: found %d mblks in the MD\n", nnodes);

	/*
	 * Find the mblk of interest
	 */
	for (idx = 0; idx < nnodes; idx++) {

		if (md_get_prop_val(mdp, listp[idx], "base", &base_prop)) {
			DR_DBG_MEM("Missing 'base' property for mblk node %d\n",
			    idx);
			break;
		}

		if (md_get_prop_val(mdp, listp[idx], "size", &size_prop)) {
			DR_DBG_MEM("Missing 'size' property for mblk node %d\n",
			    idx);
			break;
		}

		if (base_prop <= mbp->addr &&
		    (base_prop + size_prop) >= (mbp->addr + mbp->size)) {
			/* found a match */
			DR_DBG_MEM("dr_mem_find_node_md: found mblk "
			    "0x%lx.0x%lx in MD\n", mbp->addr, mbp->size);
			result = listp[idx];
			break;
		}
	}

	if (result == MDE_INVAL_ELEM_COOKIE) {
		DR_DBG_MEM("mblk 0x%lx.0x%lx not in MD\n",
		    mbp->addr, mbp->size);
	}

	return (result);
}

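/*
 * Add a span of physical memory to the system and extend the
 * kernel cage growth list to cover it. Returns a DR_MEM_RES_*
 * value.
 */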
static int
mem_add(pfn_t base, pgcnt_t npgs)
{
	int rv, rc;

	DR_DBG_MEM("%s: begin base=0x%lx npgs=0x%lx\n", __func__, base, npgs);

	if (npgs == 0)
		return (DR_MEM_RES_OK);

	rv = kphysm_add_memory_dynamic(base, npgs);
	DR_DBG_MEM("%s: kphysm_add(0x%lx, 0x%lx) = %d\n", __func__, base,
	    npgs, rv);
	if (rv == KPHYSM_OK) {
		if ((rc = kcage_range_add(base, npgs, KCAGE_DOWN)) != 0)
			cmn_err(CE_WARN, "kcage_range_add() = %d", rc);
	}
	rv = cvt_err(rv);
	return (rv);
}

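/*
 * Completion callback for kphysm_del_start(). Records the result
 * and wakes the thread waiting in mem_del().
 */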
static void
del_done(void *arg, int error)
{
	mem_sync_t *ms = arg;

	mutex_enter(&ms->lock);
	ms->error = error;
	ms->done = 1;
	cv_signal(&ms->cond);
	mutex_exit(&ms->lock);
}

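/*
 * Delete a span of physical memory from the system. The delete is
 * performed asynchronously by kphysm_del_start(); this routine
 * blocks until it completes, fails, or is cancelled by a pending
 * signal. Returns a DR_MEM_RES_* value.
 */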
static int
mem_del(pfn_t base, pgcnt_t npgs)
{
	int rv, err, del_range = 0;
	int convert = 1;
	mem_sync_t ms;
	memquery_t mq;
	memhandle_t mh;
	struct memlist *ml;
	struct memlist *d_ml = NULL;

	DR_DBG_MEM("%s: begin base=0x%lx npgs=0x%lx\n", __func__, base, npgs);

	if (npgs == 0)
		return (DR_MEM_RES_OK);

	if ((rv = kphysm_del_gethandle(&mh)) != KPHYSM_OK) {
		cmn_err(CE_WARN, "%s: del_gethandle() = %d", __func__, rv);
		rv = cvt_err(rv);
		return (rv);
	}
	if ((rv = kphysm_del_span_query(base, npgs, &mq))
	    != KPHYSM_OK) {
		cmn_err(CE_WARN, "%s: del_span_query() = %d", __func__, rv);
		goto done;
	}
	if (mq.nonrelocatable) {
		DR_DBG_MEM("%s: non-reloc pages = %ld\n",
		    __func__, mq.nonrelocatable);
		rv = KPHYSM_ENONRELOC;
		goto done;
	}
	if ((rv = kcage_range_delete(base, npgs)) != 0) {
		switch (rv) {
		case EBUSY:
			rv = DR_MEM_RES_ENOTVIABLE;
			break;
		default:
			rv = DR_MEM_RES_FAILURE;
			break;
		}
		convert = 0; /* conversion done */
		cmn_err(CE_WARN, "%s: del_range() = %d", __func__, rv);
		goto done;
	} else {
		del_range++;
	}
	if ((rv = kphysm_del_span(mh, base, npgs)) != KPHYSM_OK) {
		cmn_err(CE_WARN, "%s: del_span() = %d", __func__, rv);
		goto done;
	}
	if ((rv = memlist_add_span(ptob(base), ptob(npgs), &d_ml))
	    != MEML_SPANOP_OK) {
		switch (rv) {
		case MEML_SPANOP_ESPAN:
			rv = DR_MEM_RES_ESPAN;
			break;
		case MEML_SPANOP_EALLOC:
			rv = DR_MEM_RES_ERESOURCE;
			break;
		default:
			rv = DR_MEM_RES_FAILURE;
			break;
		}
		convert = 0; /* conversion done */
		cmn_err(CE_WARN, "%s: add_span() = %d", __func__, rv);
		goto done;
	}

	DR_DBG_MEM("%s: reserved=0x%lx\n", __func__, npgs);

	bzero((void *) &ms, sizeof (ms));

	mutex_init(&ms.lock, NULL, MUTEX_DRIVER, NULL);
	cv_init(&ms.cond, NULL, CV_DRIVER, NULL);
	mutex_enter(&ms.lock);

	if ((rv = kphysm_del_start(mh, del_done, (void *) &ms)) == KPHYSM_OK) {
		/*
		 * Since we've called drctl_config_init, we are the only
		 * DR ctl operation in progress.  Set dr_mh to the
		 * delete memhandle for use by stat and cancel.
		 */
		ASSERT(dr_mh == NULL);
		dr_mh = mh;

		/*
		 * Wait for completion or interrupt.
		 */
		while (!ms.done) {
			if (cv_wait_sig(&ms.cond, &ms.lock) == 0) {
				/*
				 * There is a pending signal.
				 */
				(void) kphysm_del_cancel(mh);
				DR_DBG_MEM("%s: cancel\n", __func__);
				/*
				 * Wait for completion.
				 */
				while (!ms.done)
					cv_wait(&ms.cond, &ms.lock);
			}
		}
		dr_mh = NULL;
		rv = ms.error;
	} else {
		DR_DBG_MEM("%s: del_start() = %d\n", __func__, rv);
	}

	mutex_exit(&ms.lock);
	cv_destroy(&ms.cond);
	mutex_destroy(&ms.lock);

done:
	if (rv && del_range) {
		/*
		 * Add back the spans to the kcage growth list.
		 */
		for (ml = d_ml; ml; ml = ml->ml_next)
			if ((err = kcage_range_add(btop(ml->ml_address),
			    btop(ml->ml_size), KCAGE_DOWN)) != 0)
				cmn_err(CE_WARN, "kcage_range_add() = %d", err);
	}
	memlist_free_list(d_ml);

	if ((err = kphysm_del_release(mh)) != KPHYSM_OK)
		cmn_err(CE_WARN, "%s: del_release() = %d", __func__, err);
	if (convert)
		rv = cvt_err(rv);

	DR_DBG_MEM("%s: rv=%d\n", __func__, rv);

	return (rv);
}