xref: /illumos-gate/usr/src/uts/sun4v/io/dr_mem.c (revision 28bda19c304ae9f3ffa10394ef34c6e8f9e4c5f5)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * sun4v Memory DR Module
29  */
30 
31 
32 #include <sys/types.h>
33 #include <sys/cmn_err.h>
34 #include <sys/vmem.h>
35 #include <sys/kmem.h>
36 #include <sys/systm.h>
37 #include <sys/machsystm.h>	/* for page_freelist_coalesce() */
38 #include <sys/errno.h>
39 #include <sys/memnode.h>
40 #include <sys/memlist.h>
41 #include <sys/memlist_impl.h>
42 #include <sys/tuneable.h>
43 #include <sys/proc.h>
44 #include <sys/disp.h>
45 #include <sys/debug.h>
46 #include <sys/vm.h>
47 #include <sys/callb.h>
48 #include <sys/memlist_plat.h>	/* for installed_top_size() */
49 #include <sys/condvar_impl.h>	/* for CV_HAS_WAITERS() */
50 #include <sys/dumphdr.h>	/* for dump_resize() */
51 #include <sys/atomic.h>		/* for use in stats collection */
52 #include <sys/rwlock.h>
53 #include <vm/seg_kmem.h>
54 #include <vm/seg_kpm.h>
55 #include <vm/page.h>
56 #include <vm/vm_dep.h>
57 #define	SUNDDI_IMPL		/* so sunddi.h will not redefine splx() et al */
58 #include <sys/sunddi.h>
59 #include <sys/mem_config.h>
60 #include <sys/mem_cage.h>
61 #include <sys/lgrp.h>
62 #include <sys/ddi.h>
63 
64 #include <sys/modctl.h>
65 #include <sys/sysevent/dr.h>
66 #include <sys/mach_descrip.h>
67 #include <sys/mdesc.h>
68 #include <sys/ds.h>
69 #include <sys/drctl.h>
70 #include <sys/dr_util.h>
71 #include <sys/dr_mem.h>
72 
73 
74 /*
75  * DR operations are subject to Memory Alignment restrictions
76  * for both address and the size of the request.
77  */
78 #define	MA_ADDR	0x10000000	/* addr alignment 256M */
79 #define	MA_SIZE	0x10000000	/* size alignment 256M */
80 
81 #define	MBLK_IS_VALID(m) \
82 	(IS_P2ALIGNED((m)->addr, MA_ADDR) && IS_P2ALIGNED((m)->size, MA_SIZE))
83 
84 static memhandle_t dr_mh;	/* memory handle for delete */
85 
86 static struct modlmisc modlmisc = {
87 	&mod_miscops,
88 	"sun4v memory DR"
89 };
90 
91 static struct modlinkage modlinkage = {
92 	MODREV_1,
93 	(void *)&modlmisc,
94 	NULL
95 };
96 
97 static int dr_mem_allow_unload = 0;
98 
99 typedef int (*fn_t)(dr_mem_blk_t *, int *);
100 
101 /*
102  * Global Domain Services (DS) Handle
103  */
104 static ds_svc_hdl_t ds_handle;
105 
106 /*
107  * Supported DS Capability Versions
108  */
109 static ds_ver_t		dr_mem_vers[] = { { 1, 0 } };
110 #define	DR_MEM_NVERS	(sizeof (dr_mem_vers) / sizeof (dr_mem_vers[0]))
111 
112 /*
113  * DS Capability Description
114  */
115 static ds_capability_t dr_mem_cap = {
116 	DR_MEM_DS_ID,		/* svc_id */
117 	dr_mem_vers,		/* vers */
118 	DR_MEM_NVERS		/* nvers */
119 };
120 
121 /*
122  * DS Callbacks
123  */
124 static void dr_mem_reg_handler(ds_cb_arg_t, ds_ver_t *, ds_svc_hdl_t);
125 static void dr_mem_unreg_handler(ds_cb_arg_t arg);
126 static void dr_mem_data_handler(ds_cb_arg_t arg, void *buf, size_t buflen);
127 
128 /*
129  * DS Client Ops Vector
130  */
131 static ds_clnt_ops_t dr_mem_ops = {
132 	dr_mem_reg_handler,	/* ds_reg_cb */
133 	dr_mem_unreg_handler,	/* ds_unreg_cb */
134 	dr_mem_data_handler,	/* ds_data_cb */
135 	NULL			/* cb_arg */
136 };
137 
138 /*
139  * Operation Results
140  *
141  * Used internally to gather results while an operation on a
142  * list of mblks is in progress. In particular, it is used to
143  * keep track of which mblks have already failed so that they are
144  * not processed further, and the manner in which they failed.
145  */
typedef struct {
	uint64_t	addr;		/* base address of the mblk */
	uint64_t	size;		/* size of the mblk */
	uint32_t	result;		/* DR_MEM_RES_* outcome code */
	uint32_t	status;		/* DR_MEM_STAT_* state code */
	char		*string;	/* allocated message, or NULL */
} dr_mem_res_t;
153 
154 static char *
155 dr_mem_estr[] = {
156 	"operation succeeded",		/* DR_MEM_RES_OK */
157 	"operation failed",		/* DR_MEM_RES_FAILURE */
158 	"operation was blocked",	/* DR_MEM_RES_BLOCKED */
159 	"memory not defined in MD",	/* DR_MEM_RES_NOT_IN_MD */
160 	"memory already in use",	/* DR_MEM_RES_ESPAN */
161 	"memory access test failed",	/* DR_MEM_RES_EFAULT */
162 	"resource not available",	/* DR_MEM_RES_ERESOURCE */
163 	"permanent pages in span",	/* DR_MEM_RES_PERM */
164 	"memory span busy",		/* DR_MEM_RES_EBUSY */
165 	"VM viability test failed",	/* DR_MEM_RES_ENOTVIABLE */
166 	"no pages to unconfigure",	/* DR_MEM_RES_ENOWORK */
167 	"operation cancelled",		/* DR_MEM_RES_ECANCELLED */
168 	"operation refused",		/* DR_MEM_RES_EREFUSED */
169 	"memory span duplicate",	/* DR_MEM_RES_EDUP */
170 	"invalid argument"		/* DR_MEM_RES_EINVAL */
171 };
172 
typedef struct {
	kcondvar_t cond;	/* signalled by del_done() */
	kmutex_t lock;		/* held while error/done are accessed */
	int error;		/* error value handed to del_done() */
	int done;		/* set to 1 by del_done() */
} mem_sync_t;
179 
180 /*
181  * Internal Functions
182  */
183 static int dr_mem_init(void);
184 static int dr_mem_fini(void);
185 
186 static int dr_mem_list_wrk(dr_mem_hdr_t *, dr_mem_hdr_t **, int *);
187 static int dr_mem_list_query(dr_mem_hdr_t *, dr_mem_hdr_t **, int *);
188 static int dr_mem_del_stat(dr_mem_hdr_t *, dr_mem_hdr_t **, int *);
189 static int dr_mem_del_cancel(dr_mem_hdr_t *, dr_mem_hdr_t **, int *);
190 
191 static int dr_mem_unconfigure(dr_mem_blk_t *, int *);
192 static int dr_mem_configure(dr_mem_blk_t *, int *);
193 static void dr_mem_query(dr_mem_blk_t *, dr_mem_query_t *);
194 
195 static dr_mem_res_t *dr_mem_res_array_init(dr_mem_hdr_t *, drctl_rsrc_t *, int);
196 static void dr_mem_res_array_fini(dr_mem_res_t *res, int nres);
197 static size_t dr_mem_pack_response(dr_mem_hdr_t *req, dr_mem_res_t *res,
198     dr_mem_hdr_t **respp);
199 
200 static int dr_mem_find(dr_mem_blk_t *mbp);
201 static mde_cookie_t dr_mem_find_node_md(dr_mem_blk_t *, md_t *, mde_cookie_t *);
202 
203 static int mem_add(pfn_t, pgcnt_t);
204 static int mem_del(pfn_t, pgcnt_t);
205 
206 static size_t rsvaddsz;
207 extern void i_dr_mem_init(uint64_t *);
208 extern void i_dr_mem_fini();
209 extern void i_dr_mem_update();
210 extern int kphysm_add_memory_dynamic(pfn_t, pgcnt_t);
211 
212 int
213 _init(void)
214 {
215 	int	status;
216 
217 	/* check that Memory DR is enabled */
218 	if (dr_is_disabled(DR_TYPE_MEM))
219 		return (ENOTSUP);
220 
221 	if ((status = dr_mem_init()) != 0) {
222 		cmn_err(CE_NOTE, "Memory DR initialization failed");
223 		return (status);
224 	}
225 
226 	if ((status = mod_install(&modlinkage)) != 0) {
227 		(void) dr_mem_fini();
228 	}
229 
230 	return (status);
231 }
232 
233 int
234 _info(struct modinfo *modinfop)
235 {
236 	return (mod_info(&modlinkage, modinfop));
237 }
238 
239 int
240 _fini(void)
241 {
242 	int	status;
243 
244 	if (dr_mem_allow_unload == 0)
245 		return (EBUSY);
246 
247 	if ((status = mod_remove(&modlinkage)) == 0) {
248 		(void) dr_mem_fini();
249 	}
250 
251 	return (status);
252 }
253 
254 static int
255 dr_mem_init(void)
256 {
257 	int rv;
258 
259 	if ((rv = ds_cap_init(&dr_mem_cap, &dr_mem_ops)) != 0) {
260 		cmn_err(CE_NOTE, "dr_mem: ds_cap_init failed: %d", rv);
261 		return (rv);
262 	}
263 
264 	i_dr_mem_init(&rsvaddsz);
265 
266 	return (0);
267 }
268 
269 static int
270 dr_mem_fini(void)
271 {
272 	int rv;
273 
274 	i_dr_mem_fini();
275 
276 	if ((rv = ds_cap_fini(&dr_mem_cap)) != 0) {
277 		cmn_err(CE_NOTE, "dr_mem: ds_cap_fini failed: %d", rv);
278 	}
279 
280 	return (rv);
281 }
282 
283 static void
284 dr_mem_reg_handler(ds_cb_arg_t arg, ds_ver_t *ver, ds_svc_hdl_t hdl)
285 {
286 	DR_DBG_MEM("reg_handler: arg=0x%p, ver=%d.%d, hdl=0x%lx\n", arg,
287 	    ver->major, ver->minor, hdl);
288 
289 	ds_handle = hdl;
290 }
291 
292 static void
293 dr_mem_unreg_handler(ds_cb_arg_t arg)
294 {
295 	DR_DBG_MEM("unreg_handler: arg=0x%p\n", arg);
296 
297 	ds_handle = DS_INVALID_HDL;
298 }
299 
300 /*ARGSUSED*/
301 static void
302 dr_mem_data_handler(ds_cb_arg_t arg, void *buf, size_t buflen)
303 {
304 	dr_mem_hdr_t	*req = buf;
305 	dr_mem_hdr_t	err_resp;
306 	dr_mem_hdr_t	*resp = &err_resp;
307 	int		resp_len = 0;
308 	int		rv = EINVAL;
309 
310 	/*
311 	 * Sanity check the message
312 	 */
313 	if (buflen < sizeof (dr_mem_hdr_t)) {
314 		DR_DBG_MEM("incoming message short: expected at least %ld "
315 		    "bytes, received %ld\n", sizeof (dr_mem_hdr_t), buflen);
316 		goto done;
317 	}
318 
319 	if (req == NULL) {
320 		DR_DBG_MEM("empty message: expected at least %ld bytes\n",
321 		    sizeof (dr_mem_hdr_t));
322 		goto done;
323 	}
324 
325 	DR_DBG_MEM("incoming request:\n");
326 	DR_DBG_DUMP_MSG(buf, buflen);
327 
328 	/*
329 	 * Process the command
330 	 */
331 	switch (req->msg_type) {
332 	case DR_MEM_CONFIGURE:
333 	case DR_MEM_UNCONFIGURE:
334 		if (req->msg_arg == 0) {
335 			DR_DBG_MEM("No mblks specified for operation\n");
336 			goto done;
337 		}
338 		if ((rv = dr_mem_list_wrk(req, &resp, &resp_len)) != 0) {
339 			DR_DBG_MEM("%s failed (%d)\n",
340 			    (req->msg_type == DR_MEM_CONFIGURE) ?
341 			    "Memory configure" : "Memory unconfigure", rv);
342 		}
343 		break;
344 
345 	case DR_MEM_UNCONF_STATUS:
346 		if ((rv = dr_mem_del_stat(req, &resp, &resp_len)) != 0)
347 			DR_DBG_MEM("Memory delete status failed (%d)\n", rv);
348 		break;
349 
350 	case DR_MEM_UNCONF_CANCEL:
351 		if ((rv = dr_mem_del_cancel(req, &resp, &resp_len)) != 0)
352 			DR_DBG_MEM("Memory delete cancel failed (%d)\n", rv);
353 		break;
354 
355 	case DR_MEM_QUERY:
356 		if (req->msg_arg == 0) {
357 			DR_DBG_MEM("No mblks specified for operation\n");
358 			goto done;
359 		}
360 		if ((rv = dr_mem_list_query(req, &resp, &resp_len)) != 0)
361 			DR_DBG_MEM("Memory query failed (%d)\n", rv);
362 		break;
363 
364 	default:
365 		cmn_err(CE_NOTE, "unsupported memory DR operation (%d)",
366 		    req->msg_type);
367 		break;
368 	}
369 
370 done:
371 	/* check if an error occurred */
372 	if (resp == &err_resp) {
373 		resp->req_num = (req) ? req->req_num : 0;
374 		resp->msg_type = DR_MEM_ERROR;
375 		resp->msg_arg = rv;
376 		resp_len = sizeof (dr_mem_hdr_t);
377 	}
378 
379 	DR_DBG_MEM("outgoing response:\n");
380 	DR_DBG_DUMP_MSG(resp, resp_len);
381 
382 	/* send back the response */
383 	if (ds_cap_send(ds_handle, resp, resp_len) != 0) {
384 		DR_DBG_MEM("ds_send failed\n");
385 	}
386 
387 	/* free any allocated memory */
388 	if (resp != &err_resp) {
389 		kmem_free(resp, resp_len);
390 	}
391 }
392 
393 /*
394  * Common routine to config or unconfig multiple mblks.
395  *
396  * Note: Do not modify result buffer or length on error.
397  */
398 static int
399 dr_mem_list_wrk(dr_mem_hdr_t *req, dr_mem_hdr_t **resp, int *resp_len)
400 {
401 	int		rv;
402 	int		idx;
403 	int		count;
404 	int		result;
405 	int		status;
406 	fn_t		dr_fn;
407 	int		se_hint;
408 	dr_mem_blk_t	*req_mblks;
409 	dr_mem_res_t	*res;
410 	int		drctl_cmd;
411 	int		drctl_flags = 0;
412 	drctl_rsrc_t	*drctl_req;
413 	size_t		drctl_req_len;
414 	drctl_resp_t	*drctl_resp;
415 	drctl_rsrc_t	*drctl_rsrc;
416 	size_t		drctl_resp_len = 0;
417 	drctl_cookie_t	drctl_res_ck;
418 
419 	ASSERT((req != NULL) && (req->msg_arg != 0));
420 
421 	count = req->msg_arg;
422 
423 	/*
424 	 * Extract all information that is specific
425 	 * to the various types of operations.
426 	 */
427 	switch (req->msg_type) {
428 	case DR_MEM_CONFIGURE:
429 		dr_fn = dr_mem_configure;
430 		drctl_cmd = DRCTL_MEM_CONFIG_REQUEST;
431 		se_hint = SE_HINT_INSERT;
432 		break;
433 	case DR_MEM_UNCONFIGURE:
434 		dr_fn = dr_mem_unconfigure;
435 		drctl_cmd = DRCTL_MEM_UNCONFIG_REQUEST;
436 		se_hint = SE_HINT_REMOVE;
437 		break;
438 	default:
439 		/* Programming error if we reach this. */
440 		cmn_err(CE_NOTE, "%s: bad msg_type %d\n",
441 		    __func__, req->msg_type);
442 		ASSERT(0);
443 		return (-1);
444 	}
445 
446 	/* the incoming array of mblks to operate on */
447 	req_mblks = DR_MEM_CMD_MBLKS(req);
448 
449 	/* allocate drctl request msg based on incoming resource count */
450 	drctl_req_len = sizeof (drctl_rsrc_t) * count;
451 	drctl_req = kmem_zalloc(drctl_req_len, KM_SLEEP);
452 
453 	/* copy the size for the drctl call from the incoming request msg */
454 	for (idx = 0; idx < count; idx++) {
455 		drctl_req[idx].res_mem_addr = req_mblks[idx].addr;
456 		drctl_req[idx].res_mem_size = req_mblks[idx].size;
457 	}
458 
459 	rv = drctl_config_init(drctl_cmd, drctl_flags, drctl_req,
460 	    count, &drctl_resp, &drctl_resp_len, &drctl_res_ck);
461 
462 	ASSERT((drctl_resp != NULL) && (drctl_resp_len != 0));
463 
464 	if (rv != 0) {
465 		DR_DBG_MEM("%s: drctl_config_init returned: %d\n",
466 		    __func__, rv);
467 		kmem_free(drctl_resp, drctl_resp_len);
468 		kmem_free(drctl_req, drctl_req_len);
469 		return (rv);
470 	}
471 
472 	ASSERT(drctl_resp->resp_type == DRCTL_RESP_OK);
473 
474 	drctl_rsrc = drctl_resp->resp_resources;
475 
476 	/* create the result scratch array */
477 	res = dr_mem_res_array_init(req, drctl_rsrc, count);
478 
479 	/* perform the specified operation on each of the mblks */
480 	for (idx = 0; idx < count; idx++) {
481 		/*
482 		 * If no action will be taken against the current
483 		 * mblk, update the drctl resource information to
484 		 * ensure that it gets recovered properly during
485 		 * the drctl fini() call.
486 		 */
487 		if (res[idx].result != DR_MEM_RES_OK) {
488 			drctl_req[idx].status = DRCTL_STATUS_CONFIG_FAILURE;
489 			continue;
490 		}
491 
492 		/* call the function to perform the actual operation */
493 		result = (*dr_fn)(&req_mblks[idx], &status);
494 
495 		/* save off results of the operation */
496 		res[idx].result = result;
497 		res[idx].status = status;
498 		res[idx].addr = req_mblks[idx].addr;	/* for partial case */
499 		res[idx].size = req_mblks[idx].size;	/* for partial case */
500 		res[idx].string = i_ddi_strdup(dr_mem_estr[result], KM_SLEEP);
501 
502 		/* save result for drctl fini() reusing init() msg memory */
503 		drctl_req[idx].status = (result != DR_MEM_RES_OK) ?
504 		    DRCTL_STATUS_CONFIG_FAILURE : DRCTL_STATUS_CONFIG_SUCCESS;
505 
506 		DR_DBG_MEM("%s: mblk 0x%lx.0x%lx stat %d result %d off '%s'\n",
507 		    __func__, req_mblks[idx].addr, req_mblks[idx].size,
508 		    drctl_req[idx].status, result,
509 		    (res[idx].string) ? res[idx].string : "");
510 	}
511 
512 	if ((rv = drctl_config_fini(&drctl_res_ck, drctl_req, count)) != 0)
513 		DR_DBG_MEM("%s: drctl_config_fini returned: %d\n",
514 		    __func__, rv);
515 
516 	/*
517 	 * Operation completed without any fatal errors.
518 	 * Pack the response for transmission.
519 	 */
520 	*resp_len = dr_mem_pack_response(req, res, resp);
521 
522 	/* notify interested parties about the operation */
523 	dr_generate_event(DR_TYPE_MEM, se_hint);
524 
525 	/*
526 	 * Deallocate any scratch memory.
527 	 */
528 	kmem_free(drctl_resp, drctl_resp_len);
529 	kmem_free(drctl_req, drctl_req_len);
530 
531 	dr_mem_res_array_fini(res, count);
532 
533 	return (0);
534 }
535 
536 /*
537  * Allocate and initialize a result array based on the initial
538  * drctl operation. A valid result array is always returned.
539  */
540 static dr_mem_res_t *
541 dr_mem_res_array_init(dr_mem_hdr_t *req, drctl_rsrc_t *rsrc, int nrsrc)
542 {
543 	int		idx;
544 	dr_mem_res_t	*res;
545 	char		*err_str;
546 	size_t		err_len;
547 
548 	/* allocate zero filled buffer to initialize fields */
549 	res = kmem_zalloc(nrsrc * sizeof (dr_mem_res_t), KM_SLEEP);
550 
551 	/*
552 	 * Fill in the result information for each resource.
553 	 */
554 	for (idx = 0; idx < nrsrc; idx++) {
555 		res[idx].addr = rsrc[idx].res_mem_addr;
556 		res[idx].size = rsrc[idx].res_mem_size;
557 		res[idx].result = DR_MEM_RES_OK;
558 
559 		if (rsrc[idx].status == DRCTL_STATUS_ALLOW)
560 			continue;
561 
562 		/*
563 		 * Update the state information for this mblk.
564 		 */
565 		res[idx].result = DR_MEM_RES_BLOCKED;
566 		res[idx].status = (req->msg_type == DR_MEM_CONFIGURE) ?
567 		    DR_MEM_STAT_UNCONFIGURED : DR_MEM_STAT_CONFIGURED;
568 
569 		/*
570 		 * If an error string exists, copy it out of the
571 		 * message buffer. This eliminates any dependency
572 		 * on the memory allocated for the message buffer
573 		 * itself.
574 		 */
575 		if (rsrc[idx].offset != NULL) {
576 			err_str = (char *)rsrc + rsrc[idx].offset;
577 			err_len = strlen(err_str) + 1;
578 
579 			res[idx].string = kmem_alloc(err_len, KM_SLEEP);
580 			bcopy(err_str, res[idx].string, err_len);
581 		}
582 	}
583 
584 	return (res);
585 }
586 
587 static void
588 dr_mem_res_array_fini(dr_mem_res_t *res, int nres)
589 {
590 	int	idx;
591 	size_t	str_len;
592 
593 	for (idx = 0; idx < nres; idx++) {
594 		/* deallocate the error string if present */
595 		if (res[idx].string) {
596 			str_len = strlen(res[idx].string) + 1;
597 			kmem_free(res[idx].string, str_len);
598 		}
599 	}
600 
601 	/* deallocate the result array itself */
602 	kmem_free(res, sizeof (dr_mem_res_t) * nres);
603 }
604 
605 /*
606  * Allocate and pack a response message for transmission based
607  * on the specified result array. A valid response message and
608  * valid size information is always returned.
609  */
610 static size_t
611 dr_mem_pack_response(dr_mem_hdr_t *req, dr_mem_res_t *res, dr_mem_hdr_t **respp)
612 {
613 	int		idx;
614 	dr_mem_hdr_t	*resp;
615 	dr_mem_stat_t	*resp_stat;
616 	size_t		resp_len;
617 	uint32_t	curr_off;
618 	caddr_t		curr_str;
619 	size_t		str_len;
620 	size_t		stat_len;
621 	int		nstat = req->msg_arg;
622 
623 	/*
624 	 * Calculate the size of the response message
625 	 * and allocate an appropriately sized buffer.
626 	 */
627 	resp_len = sizeof (dr_mem_hdr_t);
628 
629 	/* add the stat array size */
630 	stat_len = sizeof (dr_mem_stat_t) * nstat;
631 	resp_len += stat_len;
632 
633 	/* add the size of any error strings */
634 	for (idx = 0; idx < nstat; idx++) {
635 		if (res[idx].string != NULL) {
636 			resp_len += strlen(res[idx].string) + 1;
637 		}
638 	}
639 
640 	/* allocate the message buffer */
641 	resp = kmem_zalloc(resp_len, KM_SLEEP);
642 
643 	/*
644 	 * Fill in the header information.
645 	 */
646 	resp->req_num = req->req_num;
647 	resp->msg_type = DR_MEM_OK;
648 	resp->msg_arg = nstat;
649 
650 	/*
651 	 * Fill in the stat information.
652 	 */
653 	resp_stat = DR_MEM_RESP_STATS(resp);
654 
655 	/* string offsets start immediately after stat array */
656 	curr_off = sizeof (dr_mem_hdr_t) + stat_len;
657 	curr_str = (char *)resp_stat + stat_len;
658 
659 	for (idx = 0; idx < nstat; idx++) {
660 		resp_stat[idx].addr = res[idx].addr;
661 		resp_stat[idx].size = res[idx].size;
662 		resp_stat[idx].result = res[idx].result;
663 		resp_stat[idx].status = res[idx].status;
664 
665 		if (res[idx].string != NULL) {
666 			/* copy over the error string */
667 			str_len = strlen(res[idx].string) + 1;
668 			bcopy(res[idx].string, curr_str, str_len);
669 			resp_stat[idx].string_off = curr_off;
670 
671 			curr_off += str_len;
672 			curr_str += str_len;
673 		}
674 	}
675 
676 	/* buffer should be exactly filled */
677 	ASSERT(curr_off == resp_len);
678 
679 	*respp = resp;
680 	return (resp_len);
681 }
682 
683 static void
684 dr_mem_query(dr_mem_blk_t *mbp, dr_mem_query_t *mqp)
685 {
686 	memquery_t mq;
687 
688 	DR_DBG_MEM("dr_mem_query...\n");
689 
690 
691 	(void) kphysm_del_span_query(btop(mbp->addr), btop(mbp->size), &mq);
692 
693 	if (!mq.phys_pages)
694 		return;
695 
696 	mqp->addr = mbp->addr;
697 	mqp->mq.phys_pages = ptob(mq.phys_pages);
698 	mqp->mq.managed = ptob(mq.managed);
699 	mqp->mq.nonrelocatable = ptob(mq.nonrelocatable);
700 	mqp->mq.first_nonrelocatable = ptob(mq.first_nonrelocatable);
701 	mqp->mq.last_nonrelocatable = ptob(mq.last_nonrelocatable);
702 	/*
703 	 * Set to the max byte offset within the page.
704 	 */
705 	if (mqp->mq.nonrelocatable)
706 		mqp->mq.last_nonrelocatable += PAGESIZE - 1;
707 }
708 
709 /*
710  * Do not modify result buffer or length on error.
711  */
712 static int
713 dr_mem_list_query(dr_mem_hdr_t *req, dr_mem_hdr_t **resp, int *resp_len)
714 {
715 	int		idx;
716 	int		rlen;
717 	int		nml;
718 	struct memlist	*ml;
719 	dr_mem_blk_t	*req_mblks, mb;
720 	dr_mem_hdr_t	*rp;
721 	dr_mem_query_t	*stat;
722 
723 	/* the incoming array of req_mblks to configure */
724 	req_mblks = DR_MEM_CMD_MBLKS(req);
725 
726 	/* allocate a response message, should be freed by caller */
727 	nml = 0;
728 	rlen = sizeof (dr_mem_hdr_t);
729 	if (req_mblks->addr == NULL && req_mblks->size == 0) {
730 		/*
731 		 * Request is for domain's full view of it's memory.
732 		 */
733 		memlist_read_lock();
734 		for (ml = phys_install; ml; ml = ml->next)
735 			nml++;
736 
737 		rlen += nml * sizeof (dr_mem_query_t);
738 	} else {
739 		rlen += req->msg_arg * sizeof (dr_mem_query_t);
740 	}
741 	rp = kmem_zalloc(rlen, KM_SLEEP);
742 
743 	/* fill in the known data */
744 	rp->req_num = req->req_num;
745 	rp->msg_type = DR_MEM_OK;
746 	rp->msg_arg = nml ? nml : req->msg_arg;
747 
748 	/* stat array for the response */
749 	stat = DR_MEM_RESP_QUERY(rp);
750 
751 	/* get the status for each of the mblocks */
752 	if (nml) {
753 		for (idx = 0, ml = phys_install; ml; ml = ml->next, idx++) {
754 			mb.addr = ml->address;
755 			mb.size = ml->size;
756 			dr_mem_query(&mb, &stat[idx]);
757 		}
758 		memlist_read_unlock();
759 	} else {
760 		for (idx = 0; idx < req->msg_arg; idx++)
761 			dr_mem_query(&req_mblks[idx], &stat[idx]);
762 	}
763 
764 	*resp = rp;
765 	*resp_len = rlen;
766 
767 	return (0);
768 }
769 
770 static int
771 cvt_err(int err)
772 {
773 	int rv;
774 
775 	switch (err) {
776 	case KPHYSM_OK:
777 		rv = DR_MEM_RES_OK;
778 		break;
779 	case KPHYSM_ESPAN:
780 		rv = DR_MEM_RES_ESPAN;
781 		break;
782 	case KPHYSM_EFAULT:
783 		rv = DR_MEM_RES_EFAULT;
784 		break;
785 	case KPHYSM_ERESOURCE:
786 		rv = DR_MEM_RES_ERESOURCE;
787 		break;
788 	case KPHYSM_ENOTSUP:
789 	case KPHYSM_ENOHANDLES:
790 		rv = DR_MEM_RES_FAILURE;
791 		break;
792 	case KPHYSM_ENONRELOC:
793 		rv = DR_MEM_RES_PERM;
794 		break;
795 	case KPHYSM_EHANDLE:
796 		rv = DR_MEM_RES_FAILURE;
797 		break;
798 	case KPHYSM_EBUSY:
799 		rv = DR_MEM_RES_EBUSY;
800 		break;
801 	case KPHYSM_ENOTVIABLE:
802 		rv = DR_MEM_RES_ENOTVIABLE;
803 		break;
804 	case KPHYSM_ESEQUENCE:
805 		rv = DR_MEM_RES_FAILURE;
806 		break;
807 	case KPHYSM_ENOWORK:
808 		rv = DR_MEM_RES_ENOWORK;
809 		break;
810 	case KPHYSM_ECANCELLED:
811 		rv = DR_MEM_RES_ECANCELLED;
812 		break;
813 	case KPHYSM_EREFUSED:
814 		rv = DR_MEM_RES_EREFUSED;
815 		break;
816 	case KPHYSM_ENOTFINISHED:
817 	case KPHYSM_ENOTRUNNING:
818 		rv = DR_MEM_RES_FAILURE;
819 		break;
820 	case KPHYSM_EDUP:
821 		rv = DR_MEM_RES_EDUP;
822 		break;
823 	default:
824 		rv = DR_MEM_RES_FAILURE;
825 		break;
826 	}
827 
828 	return (rv);
829 }
830 
831 static int
832 dr_mem_configure(dr_mem_blk_t *mbp, int *status)
833 {
834 	int rv;
835 	uint64_t addr, size, addsz;
836 
837 	rv = 0;
838 	addr = mbp->addr;
839 	size = mbp->size;
840 
841 	DR_DBG_MEM("dr_mem_configure...\n");
842 
843 	if (!MBLK_IS_VALID(mbp)) {
844 		DR_DBG_MEM("invalid mblk 0x%lx.0x%lx\n", addr, size);
845 		*status = DR_MEM_STAT_UNCONFIGURED;
846 		rv = DR_MEM_RES_EINVAL;
847 	} else if (rv = dr_mem_find(mbp)) {
848 		DR_DBG_MEM("failed to find mblk 0x%lx.0x%lx (%d)\n",
849 		    addr, size, rv);
850 		if (rv == EINVAL) {
851 			*status = DR_MEM_STAT_NOT_PRESENT;
852 			rv = DR_MEM_RES_NOT_IN_MD;
853 		} else {
854 			*status = DR_MEM_STAT_UNCONFIGURED;
855 			rv = DR_MEM_RES_FAILURE;
856 		}
857 	} else if (rsvaddsz) {
858 		addr += size;
859 
860 		/*
861 		 * Add up to the first <rsvaddsz> portion of mblock
862 		 * first since that portion has reserved meta pages.
863 		 * This will likely guarantee an additional amount of
864 		 * free pages from which we may have to allocate the
865 		 * rest of the meta pages.
866 		 *
867 		 * Break up the request in descending order (if needed)
868 		 * in order to ensure that cage grows from the high end
869 		 * of the original request.
870 		 */
871 		for (addsz = MIN(size, rsvaddsz); addsz > 0; addsz = size) {
872 			ASSERT(addr >= mbp->addr);
873 			DR_DBG_MEM("addsz=0x%lx  size=0x%lx\n", addsz, size);
874 			if (rv = mem_add(btop(addr - addsz), btop(addsz))) {
875 				DR_DBG_MEM("failed to configure span"
876 				    " 0x%lx.0x%lx (%d)\n", addr, addsz, rv);
877 				break;
878 			} else {
879 				size -= addsz;
880 				addr -= addsz;
881 			}
882 		}
883 
884 		/*
885 		 * Mark the mblock configured if any span
886 		 * in that mblock was successfully added.
887 		 *
888 		 * In case of partial success:
889 		 *
890 		 *	rv != DR_MEM_RES_OK
891 		 *	status == DR_MEM_STAT_CONFIGURED
892 		 *
893 		 * mark span actually configured.
894 		 */
895 		if (size == mbp->size && rv != KPHYSM_ESPAN) {
896 			*status = DR_MEM_STAT_UNCONFIGURED;
897 		} else {
898 			DR_DBG_MEM("failed (partial) to configure span"
899 			    " 0x%lx.0x%lx (%d)\n", addr, addsz, rv);
900 			*status = DR_MEM_STAT_CONFIGURED;
901 			mbp->addr = addr;
902 			mbp->size -= size;
903 		}
904 
905 		rv = cvt_err(rv);
906 		i_dr_mem_update();
907 	} else {
908 		/*
909 		 * The reserved feature is disabled, add whole mblock.
910 		 */
911 		rv = mem_add(btop(addr), btop(size));
912 		DR_DBG_MEM("addr=0x%lx size=0x%lx rv=%d\n", addr, size, rv);
913 		if (rv) {
914 			rv = cvt_err(rv);
915 			*status = DR_MEM_STAT_UNCONFIGURED;
916 		} else {
917 			*status = DR_MEM_STAT_CONFIGURED;
918 		}
919 	}
920 
921 	return (rv);
922 }
923 
924 static int
925 dr_mem_unconfigure(dr_mem_blk_t *mbp, int *status)
926 {
927 	int rv;
928 
929 	DR_DBG_MEM("dr_mem_unconfigure...\n");
930 
931 	if (!MBLK_IS_VALID(mbp)) {
932 		DR_DBG_MEM("invalid mblk 0x%lx.0x%lx\n",
933 		    mbp->addr, mbp->size);
934 			*status = DR_MEM_STAT_CONFIGURED;
935 			rv = DR_MEM_RES_EINVAL;
936 	} else if (rv = mem_del(btop(mbp->addr), btop(mbp->size))) {
937 		rv = cvt_err(rv);
938 		*status = DR_MEM_STAT_CONFIGURED;
939 	} else {
940 		*status = DR_MEM_STAT_UNCONFIGURED;
941 		rv = DR_MEM_RES_OK;
942 		DR_DBG_MEM("mblk 0x%lx.0x%lx unconfigured\n",
943 		    mbp->addr, mbp->size);
944 	}
945 	return (rv);
946 }
947 
948 static int
949 dr_mem_del_stat(dr_mem_hdr_t *req, dr_mem_hdr_t **resp, int *resp_len)
950 {
951 	int			status;
952 	int			rlen;
953 	memdelstat_t		del_stat, *stat;
954 	dr_mem_hdr_t		*rp;
955 
956 	/*
957 	 * If a mem delete is in progress, get its status.
958 	 */
959 	status = (dr_mh && (kphysm_del_status(dr_mh, &del_stat) == KPHYSM_OK));
960 
961 	/* allocate a response message, should be freed by caller */
962 	rlen = sizeof (dr_mem_hdr_t);
963 	rlen += status * sizeof (memdelstat_t);
964 	rp = kmem_zalloc(rlen, KM_SLEEP);
965 
966 	/* fill in the known data */
967 	rp->req_num = req->req_num;
968 	rp->msg_type = DR_MEM_OK;
969 	rp->msg_arg = status;
970 
971 	if (status) {
972 		/* stat struct for the response */
973 		stat = DR_MEM_RESP_DEL_STAT(rp);
974 		stat->phys_pages = ptob(del_stat.phys_pages);
975 		stat->managed = ptob(del_stat.managed);
976 		stat->collected = ptob(del_stat.collected);
977 	}
978 
979 	*resp = rp;
980 	*resp_len = rlen;
981 
982 	return (0);
983 }
984 
985 static int
986 dr_mem_del_cancel(dr_mem_hdr_t *req, dr_mem_hdr_t **resp, int *resp_len)
987 {
988 	int		rlen;
989 	dr_mem_hdr_t	*rp;
990 
991 	/* allocate a response message, should be freed by caller */
992 	rlen = sizeof (dr_mem_hdr_t);
993 	rp = kmem_zalloc(rlen, KM_SLEEP);
994 
995 	/* fill in the known data */
996 	rp->req_num = req->req_num;
997 	rp->msg_type = DR_MEM_OK;
998 	rp->msg_arg = (dr_mh && kphysm_del_cancel(dr_mh) != KPHYSM_OK) ?
999 	    DR_MEM_RES_EINVAL : DR_MEM_RES_OK;
1000 
1001 	*resp = rp;
1002 	*resp_len = rlen;
1003 
1004 	return (0);
1005 }
1006 
1007 static int
1008 dr_mem_find(dr_mem_blk_t *mbp)
1009 {
1010 	md_t		*mdp = NULL;
1011 	int		num_nodes;
1012 	int		rv = 0;
1013 	int		listsz;
1014 	mde_cookie_t	*listp = NULL;
1015 	mde_cookie_t	memnode;
1016 	char		*found = "found";
1017 
1018 	if ((mdp = md_get_handle()) == NULL) {
1019 		DR_DBG_MEM("unable to initialize machine description\n");
1020 		return (-1);
1021 	}
1022 
1023 	num_nodes = md_node_count(mdp);
1024 	ASSERT(num_nodes > 0);
1025 
1026 	listsz = num_nodes * sizeof (mde_cookie_t);
1027 	listp = kmem_zalloc(listsz, KM_SLEEP);
1028 
1029 	memnode = dr_mem_find_node_md(mbp, mdp, listp);
1030 
1031 	if (memnode == MDE_INVAL_ELEM_COOKIE) {
1032 		rv = EINVAL;
1033 		found = "not found";
1034 	}
1035 
1036 	DR_DBG_MEM("mblk 0x%lx.0x%lx %s\n", mbp->addr, mbp->size, found);
1037 
1038 	kmem_free(listp, listsz);
1039 	(void) md_fini_handle(mdp);
1040 
1041 	return (rv);
1042 }
1043 
1044 /*
1045  * Look up a particular mblk in the MD. Returns the mde_cookie_t
1046  * representing that mblk if present, and MDE_INVAL_ELEM_COOKIE
1047  * otherwise. It is assumed the scratch array has already been
1048  * allocated so that it can accommodate the worst case scenario,
1049  * every node in the MD.
1050  */
1051 static mde_cookie_t
1052 dr_mem_find_node_md(dr_mem_blk_t *mbp, md_t *mdp, mde_cookie_t *listp)
1053 {
1054 	int		idx;
1055 	int		nnodes;
1056 	mde_cookie_t	rootnode;
1057 	uint64_t	base_prop;
1058 	uint64_t	size_prop;
1059 	mde_cookie_t	result = MDE_INVAL_ELEM_COOKIE;
1060 
1061 	rootnode = md_root_node(mdp);
1062 	ASSERT(rootnode != MDE_INVAL_ELEM_COOKIE);
1063 
1064 	/*
1065 	 * Scan the DAG for all the mem nodes
1066 	 */
1067 	nnodes = md_scan_dag(mdp, rootnode, md_find_name(mdp, "mblock"),
1068 	    md_find_name(mdp, "fwd"), listp);
1069 
1070 	if (nnodes < 0) {
1071 		DR_DBG_MEM("Scan for mblks failed\n");
1072 		return (result);
1073 	}
1074 
1075 	DR_DBG_MEM("dr_mem_find_node_md: found %d mblks in the MD\n", nnodes);
1076 
1077 	/*
1078 	 * Find the mblk of interest
1079 	 */
1080 	for (idx = 0; idx < nnodes; idx++) {
1081 
1082 		if (md_get_prop_val(mdp, listp[idx], "base", &base_prop)) {
1083 			DR_DBG_MEM("Missing 'base' property for mblk node %d\n",
1084 			    idx);
1085 			break;
1086 		}
1087 
1088 		if (md_get_prop_val(mdp, listp[idx], "size", &size_prop)) {
1089 			DR_DBG_MEM("Missing 'size' property for mblk node %d\n",
1090 			    idx);
1091 			break;
1092 		}
1093 
1094 		if (base_prop <= mbp->addr &&
1095 		    (base_prop + size_prop) >= (mbp->addr + mbp->size)) {
1096 			/* found a match */
1097 			DR_DBG_MEM("dr_mem_find_node_md: found mblk "
1098 			    "0x%lx.0x%lx in MD\n", mbp->addr, mbp->size);
1099 			result = listp[idx];
1100 			break;
1101 		}
1102 	}
1103 
1104 	if (result == MDE_INVAL_ELEM_COOKIE) {
1105 		DR_DBG_MEM("mblk 0x%lx.0x%lx not in MD\n",
1106 		    mbp->addr, mbp->size);
1107 	}
1108 
1109 	return (result);
1110 }
1111 
1112 static int
1113 mem_add(pfn_t base, pgcnt_t npgs)
1114 {
1115 	int rv, rc;
1116 
1117 	DR_DBG_MEM("%s: begin base=0x%lx npgs=0x%lx\n", __func__, base, npgs);
1118 
1119 	if (npgs == 0)
1120 		return (0);
1121 
1122 	rv = kphysm_add_memory_dynamic(base, npgs);
1123 	DR_DBG_MEM("%s: kphysm_add(0x%lx, 0x%lx) = %d", __func__, base, npgs,
1124 	    rv);
1125 	if (!rv) {
1126 		if (rc = kcage_range_add(base, npgs, KCAGE_DOWN))
1127 			cmn_err(CE_WARN, "kcage_range_add() = %d", rc);
1128 	}
1129 	return (rv);
1130 }
1131 
1132 static void
1133 del_done(void *arg, int error)
1134 {
1135 	mem_sync_t *ms = arg;
1136 
1137 	mutex_enter(&ms->lock);
1138 	ms->error = error;
1139 	ms->done = 1;
1140 	cv_signal(&ms->cond);
1141 	mutex_exit(&ms->lock);
1142 }
1143 
1144 static int
1145 mem_del(pfn_t base, pgcnt_t npgs)
1146 {
1147 	int rv, err, del_range = 0;
1148 	mem_sync_t ms;
1149 	memquery_t mq;
1150 	memhandle_t mh;
1151 	struct memlist *ml;
1152 	struct memlist *d_ml = NULL;
1153 
1154 	DR_DBG_MEM("%s: begin base=0x%lx npgs=0x%lx\n", __func__, base, npgs);
1155 
1156 	if (npgs == 0)
1157 		return (0);
1158 
1159 	if ((rv = kphysm_del_gethandle(&mh)) != KPHYSM_OK) {
1160 		cmn_err(CE_WARN, "%s: del_gethandle() = %d", __func__, rv);
1161 		return (rv);
1162 	}
1163 	if ((rv = kphysm_del_span_query(base, npgs, &mq))
1164 	    != KPHYSM_OK) {
1165 		cmn_err(CE_WARN, "%s: del_span_query() = %d", __func__, rv);
1166 		goto done;
1167 	}
1168 	if (mq.nonrelocatable) {
1169 		DR_DBG_MEM("%s: non-reloc pages = %ld",
1170 		    __func__, mq.nonrelocatable);
1171 		rv = KPHYSM_ENONRELOC;
1172 		goto done;
1173 	}
1174 	if (rv = kcage_range_delete(base, npgs)) {
1175 		cmn_err(CE_WARN, "%s: del_range() = %d", __func__, rv);
1176 		goto done;
1177 	} else {
1178 		del_range++;
1179 	}
1180 	if ((rv = kphysm_del_span(mh, base, npgs)) != KPHYSM_OK) {
1181 		cmn_err(CE_WARN, "%s: del_span() = %d", __func__, rv);
1182 		goto done;
1183 	}
1184 	if ((rv = memlist_add_span(ptob(base), ptob(npgs), &d_ml))
1185 	    != MEML_SPANOP_OK) {
1186 		cmn_err(CE_WARN, "%s: add_span() = %d", __func__, rv);
1187 		goto done;
1188 	}
1189 
1190 	DR_DBG_MEM("%s: reserved=0x%lx", __func__, npgs);
1191 
1192 	bzero((void *) &ms, sizeof (ms));
1193 
1194 	mutex_init(&ms.lock, NULL, MUTEX_DRIVER, NULL);
1195 	cv_init(&ms.cond, NULL, CV_DRIVER, NULL);
1196 	mutex_enter(&ms.lock);
1197 
1198 	if ((rv = kphysm_del_start(mh, del_done, (void *) &ms)) == KPHYSM_OK) {
1199 		/*
1200 		 * Since we've called drctl_config_init, we are the only
1201 		 * DR ctl operation in progress.  Set dr_mh to the
1202 		 * delete memhandle for use by stat and cancel.
1203 		 */
1204 		ASSERT(dr_mh == NULL);
1205 		dr_mh = mh;
1206 
1207 		/*
1208 		 * Wait for completion or interrupt.
1209 		 */
1210 		while (!ms.done) {
1211 			if (cv_wait_sig(&ms.cond, &ms.lock) == 0) {
1212 				/*
1213 				 * There is a pending signal.
1214 				 */
1215 				(void) kphysm_del_cancel(mh);
1216 				DR_DBG_MEM("%s: cancel", __func__);
1217 				/*
1218 				 * Wait for completion.
1219 				 */
1220 				while (!ms.done)
1221 					cv_wait(&ms.cond, &ms.lock);
1222 			}
1223 		}
1224 		dr_mh = NULL;
1225 		rv = ms.error;
1226 	} else {
1227 		DR_DBG_MEM("%s: del_start() = %d", __func__, rv);
1228 	}
1229 
1230 	mutex_exit(&ms.lock);
1231 	cv_destroy(&ms.cond);
1232 	mutex_destroy(&ms.lock);
1233 
1234 done:
1235 	if (rv && del_range) {
1236 		/*
1237 		 * Add back the spans to the kcage growth list.
1238 		 */
1239 		for (ml = d_ml; ml; ml = ml->next)
1240 			if (err = kcage_range_add(btop(ml->address),
1241 			    btop(ml->size), KCAGE_DOWN))
1242 				cmn_err(CE_WARN, "kcage_range_add() = %d", err);
1243 	}
1244 	memlist_free_list(d_ml);
1245 
1246 	if ((err = kphysm_del_release(mh)) != KPHYSM_OK)
1247 		cmn_err(CE_WARN, "%s: del_release() = %d", __func__, err);
1248 
1249 	DR_DBG_MEM("%s: rv=%d", __func__, rv);
1250 
1251 	return (rv);
1252 }
1253