xref: /titanic_52/usr/src/uts/sun4v/io/dr_cpu.c (revision 43412a427a2387ef15ab084d8f30a56a13e32cf7)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * sun4v CPU DR Module
29  */
30 
31 #include <sys/modctl.h>
32 #include <sys/processor.h>
33 #include <sys/cpuvar.h>
34 #include <sys/cpupart.h>
35 #include <sys/sunddi.h>
36 #include <sys/sunndi.h>
37 #include <sys/note.h>
38 #include <sys/sysevent/dr.h>
39 #include <sys/hypervisor_api.h>
40 #include <sys/mach_descrip.h>
41 #include <sys/mdesc.h>
42 #include <sys/ds.h>
43 #include <sys/drctl.h>
44 #include <sys/dr_util.h>
45 #include <sys/dr_cpu.h>
46 #include <sys/promif.h>
47 #include <sys/machsystm.h>
48 
49 
50 static struct modlmisc modlmisc = {
51 	&mod_miscops,
52 	"sun4v CPU DR"
53 };
54 
55 static struct modlinkage modlinkage = {
56 	MODREV_1,
57 	(void *)&modlmisc,
58 	NULL
59 };
60 
61 typedef int (*fn_t)(processorid_t, int *, boolean_t);
62 
63 /*
64  * Global DS Handle
65  */
66 static ds_svc_hdl_t ds_handle;
67 
68 /*
69  * Supported DS Capability Versions
70  */
71 static ds_ver_t		dr_cpu_vers[] = { { 1, 1 }, { 1, 0 } };
72 #define	DR_CPU_NVERS	(sizeof (dr_cpu_vers) / sizeof (dr_cpu_vers[0]))
73 
74 static ds_ver_t		version;
75 
76 /*
77  * DS Capability Description
78  */
79 static ds_capability_t dr_cpu_cap = {
80 	DR_CPU_DS_ID,		/* svc_id */
81 	dr_cpu_vers,		/* vers */
82 	DR_CPU_NVERS		/* nvers */
83 };
84 
85 #define	DRCPU_VERS_EQ(_maj, _min) \
86 	((version.major == (_maj)) && (version.minor == (_min)))
87 
88 #define	DRCPU_VERS_GTEQ(_maj, _min) \
89 	((version.major > (_maj)) ||					\
90 	((version.major == (_maj)) && (version.minor >= (_min))))
91 
92 /*
93  * DS Callbacks
94  */
95 static void dr_cpu_reg_handler(ds_cb_arg_t, ds_ver_t *, ds_svc_hdl_t);
96 static void dr_cpu_unreg_handler(ds_cb_arg_t arg);
97 static void dr_cpu_data_handler(ds_cb_arg_t arg, void *buf, size_t buflen);
98 
99 /*
100  * DS Client Ops Vector
101  */
102 static ds_clnt_ops_t dr_cpu_ops = {
103 	dr_cpu_reg_handler,	/* ds_reg_cb */
104 	dr_cpu_unreg_handler,	/* ds_unreg_cb */
105 	dr_cpu_data_handler,	/* ds_data_cb */
106 	NULL			/* cb_arg */
107 };
108 
109 /*
110  * Operation Results
111  *
112  * Used internally to gather results while an operation on a
113  * list of CPUs is in progress. In particular, it is used to
114  * keep track of which CPUs have already failed so that they are
115  * not processed further, and the manner in which they failed.
116  */
117 typedef struct {
118 	uint32_t	cpuid;
119 	uint32_t	result;
120 	uint32_t	status;
121 	char		*string;
122 } dr_cpu_res_t;
123 
124 #define	DR_CPU_MAX_ERR_LEN	64	/* maximum error string length */
125 
126 /*
127  * Internal Functions
128  */
129 static int dr_cpu_init(void);
130 static int dr_cpu_fini(void);
131 
132 static int dr_cpu_list_wrk(dr_cpu_hdr_t *, dr_cpu_hdr_t **, int *);
133 static int dr_cpu_list_status(dr_cpu_hdr_t *, dr_cpu_hdr_t **, int *);
134 
135 static int dr_cpu_unconfigure(processorid_t, int *status, boolean_t force);
136 static int dr_cpu_configure(processorid_t, int *status, boolean_t force);
137 static int dr_cpu_status(processorid_t, int *status);
138 
139 static void dr_cpu_check_cpus(dr_cpu_hdr_t *req, dr_cpu_res_t *res);
140 static void dr_cpu_check_psrset(uint32_t *cpuids, dr_cpu_res_t *res, int nres);
141 static int dr_cpu_check_bound_thr(cpu_t *cp, dr_cpu_res_t *res);
142 
143 static dr_cpu_res_t *dr_cpu_res_array_init(dr_cpu_hdr_t *, drctl_rsrc_t *, int);
144 static void dr_cpu_res_array_fini(dr_cpu_res_t *res, int nres);
145 static size_t dr_cpu_pack_response(dr_cpu_hdr_t *req, dr_cpu_res_t *res,
146     dr_cpu_hdr_t **respp);
147 
148 static int dr_cpu_probe(processorid_t newcpuid);
149 static int dr_cpu_deprobe(processorid_t cpuid);
150 
151 static dev_info_t *dr_cpu_find_node(processorid_t cpuid);
152 static mde_cookie_t dr_cpu_find_node_md(processorid_t, md_t *, mde_cookie_t *);
153 
154 int
155 _init(void)
156 {
157 	int	status;
158 
159 	/* check that CPU DR is enabled */
160 	if (dr_is_disabled(DR_TYPE_CPU)) {
161 		cmn_err(CE_CONT, "!CPU DR is disabled\n");
162 		return (-1);
163 	}
164 
165 	if ((status = dr_cpu_init()) != 0) {
166 		cmn_err(CE_NOTE, "CPU DR initialization failed");
167 		return (status);
168 	}
169 
170 	if ((status = mod_install(&modlinkage)) != 0) {
171 		(void) dr_cpu_fini();
172 	}
173 
174 	return (status);
175 }
176 
177 int
178 _info(struct modinfo *modinfop)
179 {
180 	return (mod_info(&modlinkage, modinfop));
181 }
182 
183 int dr_cpu_allow_unload;
184 
185 int
186 _fini(void)
187 {
188 	int	status;
189 
190 	if (dr_cpu_allow_unload == 0)
191 		return (EBUSY);
192 
193 	if ((status = mod_remove(&modlinkage)) == 0) {
194 		(void) dr_cpu_fini();
195 	}
196 
197 	return (status);
198 }
199 
200 static int
201 dr_cpu_init(void)
202 {
203 	int	rv;
204 
205 	if ((rv = ds_cap_init(&dr_cpu_cap, &dr_cpu_ops)) != 0) {
206 		cmn_err(CE_NOTE, "ds_cap_init failed: %d", rv);
207 		return (-1);
208 	}
209 
210 	return (0);
211 }
212 
213 static int
214 dr_cpu_fini(void)
215 {
216 	int	rv;
217 
218 	if ((rv = ds_cap_fini(&dr_cpu_cap)) != 0) {
219 		cmn_err(CE_NOTE, "ds_cap_fini failed: %d", rv);
220 		return (-1);
221 	}
222 
223 	return (0);
224 }
225 
226 static void
227 dr_cpu_reg_handler(ds_cb_arg_t arg, ds_ver_t *ver, ds_svc_hdl_t hdl)
228 {
229 	DR_DBG_CPU("reg_handler: arg=0x%p, ver=%d.%d, hdl=0x%lx\n", arg,
230 	    ver->major, ver->minor, hdl);
231 
232 	version.major = ver->major;
233 	version.minor = ver->minor;
234 	ds_handle = hdl;
235 }
236 
237 static void
238 dr_cpu_unreg_handler(ds_cb_arg_t arg)
239 {
240 	DR_DBG_CPU("unreg_handler: arg=0x%p\n", arg);
241 
242 	ds_handle = DS_INVALID_HDL;
243 }
244 
245 static void
246 dr_cpu_data_handler(ds_cb_arg_t arg, void *buf, size_t buflen)
247 {
248 	_NOTE(ARGUNUSED(arg))
249 
250 	dr_cpu_hdr_t	*req = buf;
251 	dr_cpu_hdr_t	err_resp;
252 	dr_cpu_hdr_t	*resp = &err_resp;
253 	int		resp_len = 0;
254 	int		rv;
255 
256 	/*
257 	 * Sanity check the message
258 	 */
259 	if (buflen < sizeof (dr_cpu_hdr_t)) {
260 		DR_DBG_CPU("incoming message short: expected at least %ld "
261 		    "bytes, received %ld\n", sizeof (dr_cpu_hdr_t), buflen);
262 		goto done;
263 	}
264 
265 	if (req == NULL) {
266 		DR_DBG_CPU("empty message: expected at least %ld bytes\n",
267 		    sizeof (dr_cpu_hdr_t));
268 		goto done;
269 	}
270 
271 	DR_DBG_CPU("incoming request:\n");
272 	DR_DBG_DUMP_MSG(buf, buflen);
273 
274 	if (req->num_records > NCPU) {
275 		DR_DBG_CPU("CPU list too long: %d when %d is the maximum\n",
276 		    req->num_records, NCPU);
277 		goto done;
278 	}
279 
280 	if (req->num_records == 0) {
281 		DR_DBG_CPU("No CPU specified for operation\n");
282 		goto done;
283 	}
284 
285 	/*
286 	 * Process the command
287 	 */
288 	switch (req->msg_type) {
289 	case DR_CPU_CONFIGURE:
290 	case DR_CPU_UNCONFIGURE:
291 	case DR_CPU_FORCE_UNCONFIG:
292 		if ((rv = dr_cpu_list_wrk(req, &resp, &resp_len)) != 0) {
293 			DR_DBG_CPU("%s%s failed (%d)\n",
294 			    (req->msg_type == DR_CPU_CONFIGURE) ?
295 			    "CPU configure" : "CPU unconfigure",
296 			    (req->msg_type == DR_CPU_FORCE_UNCONFIG) ?
297 			    " (forced)" : "", rv);
298 		}
299 		break;
300 
301 	case DR_CPU_STATUS:
302 		if ((rv = dr_cpu_list_status(req, &resp, &resp_len)) != 0)
303 			DR_DBG_CPU("CPU status failed (%d)\n", rv);
304 		break;
305 
306 	default:
307 		cmn_err(CE_NOTE, "unsupported DR operation (%d)",
308 		    req->msg_type);
309 		break;
310 	}
311 
312 done:
313 	/* check if an error occurred */
314 	if (resp == &err_resp) {
315 		resp->req_num = (req) ? req->req_num : 0;
316 		resp->msg_type = DR_CPU_ERROR;
317 		resp->num_records = 0;
318 		resp_len = sizeof (dr_cpu_hdr_t);
319 	}
320 
321 	DR_DBG_CPU("outgoing response:\n");
322 	DR_DBG_DUMP_MSG(resp, resp_len);
323 
324 	/* send back the response */
325 	if (ds_cap_send(ds_handle, resp, resp_len) != 0) {
326 		DR_DBG_CPU("ds_send failed\n");
327 	}
328 
329 	/* free any allocated memory */
330 	if (DRCPU_VERS_GTEQ(1, 1) || (resp != &err_resp)) {
331 		DR_DBG_KMEM("%s: free addr %p size %d\n",
332 		    __func__, (void *)resp, resp_len);
333 		kmem_free(resp, resp_len);
334 	}
335 }
336 
337 /*
338  * Create a response message which consists of a header followed
339  * by the error string passed in.
340  */
341 static size_t
342 dr_cpu_err_resp(dr_cpu_hdr_t *req, dr_cpu_hdr_t **respp, char *msg)
343 {
344 	size_t size;
345 	dr_cpu_hdr_t *resp;
346 
347 	ASSERT((msg != NULL) && (strlen(msg) > 0));
348 
349 	size = sizeof (*req) + strlen(msg) + 1;
350 	resp = kmem_alloc(size, KM_SLEEP);
351 	DR_DBG_KMEM("%s: alloc addr %p size %ld\n",
352 	    __func__, (void *)resp, size);
353 
354 	resp->req_num = req->req_num;
355 	resp->msg_type = DR_CPU_ERROR;
356 	resp->num_records = 0;
357 
358 	(void) strcpy((char *)(resp) + sizeof (*resp), msg);
359 
360 	*respp = resp;
361 
362 	return (size);
363 }
364 
365 /*
366  * Common routine to config or unconfig multiple cpus.  The unconfig
367  * case checks with the OS to see if the removal of cpus will be
368  * permitted, but can be overridden by the "force" version of the
369  * command.  Otherwise, the logic for both cases is identical.
370  *
371  * Note: Do not modify result buffer or length on error.
372  */
373 static int
374 dr_cpu_list_wrk(dr_cpu_hdr_t *req, dr_cpu_hdr_t **resp, int *resp_len)
375 {
376 	int		rv;
377 	int		idx;
378 	int		count;
379 	fn_t		dr_fn;
380 	int		se_hint;
381 	boolean_t	force = B_FALSE;
382 	uint32_t	*req_cpus;
383 	dr_cpu_res_t	*res;
384 	int		drctl_cmd;
385 	int		drctl_flags = 0;
386 	drctl_rsrc_t	*drctl_req;
387 	size_t		drctl_req_len;
388 	drctl_resp_t	*drctl_resp;
389 	drctl_rsrc_t	*drctl_rsrc;
390 	size_t		drctl_resp_len = 0;
391 	drctl_cookie_t	drctl_res_ck;
392 
393 	ASSERT((req != NULL) && (req->num_records != 0));
394 
395 	count = req->num_records;
396 
397 	/*
398 	 * Extract all information that is specific
399 	 * to the various types of operations.
400 	 */
401 	switch (req->msg_type) {
402 	case DR_CPU_CONFIGURE:
403 		dr_fn = dr_cpu_configure;
404 		drctl_cmd = DRCTL_CPU_CONFIG_REQUEST;
405 		se_hint = SE_HINT_INSERT;
406 		break;
407 	case DR_CPU_FORCE_UNCONFIG:
408 		drctl_flags = DRCTL_FLAG_FORCE;
409 		force = B_TRUE;
410 		_NOTE(FALLTHROUGH)
411 	case DR_CPU_UNCONFIGURE:
412 		dr_fn = dr_cpu_unconfigure;
413 		drctl_cmd = DRCTL_CPU_UNCONFIG_REQUEST;
414 		se_hint = SE_HINT_REMOVE;
415 		break;
416 	default:
417 		/* Programming error if we reach this. */
418 		cmn_err(CE_NOTE,
419 		    "%s: bad msg_type %d\n", __func__, req->msg_type);
420 		ASSERT(0);
421 		return (-1);
422 	}
423 
424 	/* the incoming array of cpuids to operate on */
425 	req_cpus = DR_CPU_CMD_CPUIDS(req);
426 
427 	/* allocate drctl request msg based on incoming resource count */
428 	drctl_req_len = sizeof (drctl_rsrc_t) * count;
429 	drctl_req = kmem_zalloc(drctl_req_len, KM_SLEEP);
430 	DR_DBG_KMEM("%s: alloc addr %p size %ld\n",
431 	    __func__, (void *)drctl_req, drctl_req_len);
432 
433 	/* copy the cpuids for the drctl call from the incoming request msg */
434 	for (idx = 0; idx < count; idx++)
435 		drctl_req[idx].res_cpu_id = req_cpus[idx];
436 
437 	rv = drctl_config_init(drctl_cmd, drctl_flags, drctl_req,
438 	    count, &drctl_resp, &drctl_resp_len, &drctl_res_ck);
439 
440 	ASSERT((drctl_resp != NULL) && (drctl_resp_len != 0));
441 
442 	if (rv != 0) {
443 		DR_DBG_CPU("%s: drctl_config_init "
444 		    "returned: %d\n", __func__, rv);
445 
446 		if (DRCPU_VERS_EQ(1, 0)) {
447 			rv = -1;
448 		} else {
449 			ASSERT(DRCPU_VERS_GTEQ(1, 1));
450 			ASSERT(drctl_resp->resp_type == DRCTL_RESP_ERR);
451 
452 			*resp_len = dr_cpu_err_resp(req,
453 			    resp, drctl_resp->resp_err_msg);
454 		}
455 
456 		DR_DBG_KMEM("%s: free addr %p size %ld\n",
457 		    __func__, (void *)drctl_resp, drctl_resp_len);
458 		kmem_free(drctl_resp, drctl_resp_len);
459 		DR_DBG_KMEM("%s: free addr %p size %ld\n",
460 		    __func__, (void *)drctl_req, drctl_req_len);
461 		kmem_free(drctl_req, drctl_req_len);
462 
463 		return (rv);
464 	}
465 
466 	ASSERT(drctl_resp->resp_type == DRCTL_RESP_OK);
467 
468 	drctl_rsrc = drctl_resp->resp_resources;
469 
470 	/* create the result scratch array */
471 	res = dr_cpu_res_array_init(req, drctl_rsrc, count);
472 
473 	/*
474 	 * For unconfigure, check if there are any conditions
475 	 * that will cause the operation to fail. These are
476 	 * performed before the actual unconfigure attempt so
477 	 * that a meaningful error message can be generated.
478 	 */
479 	if (req->msg_type != DR_CPU_CONFIGURE)
480 		dr_cpu_check_cpus(req, res);
481 
482 	/* perform the specified operation on each of the CPUs */
483 	for (idx = 0; idx < count; idx++) {
484 		int result;
485 		int status;
486 
487 		/*
488 		 * If no action will be taken against the current
489 		 * CPU, update the drctl resource information to
490 		 * ensure that it gets recovered properly during
491 		 * the drctl fini() call.
492 		 */
493 		if (res[idx].result != DR_CPU_RES_OK) {
494 			drctl_req[idx].status = DRCTL_STATUS_CONFIG_FAILURE;
495 			continue;
496 		}
497 
498 		/* call the function to perform the actual operation */
499 		result = (*dr_fn)(req_cpus[idx], &status, force);
500 
501 		/* save off results of the operation */
502 		res[idx].result = result;
503 		res[idx].status = status;
504 
505 		/* save result for drctl fini() reusing init() msg memory */
506 		drctl_req[idx].status = (result != DR_CPU_RES_OK) ?
507 		    DRCTL_STATUS_CONFIG_FAILURE : DRCTL_STATUS_CONFIG_SUCCESS;
508 
509 		DR_DBG_CPU("%s: cpuid %d status %d result %d off '%s'\n",
510 		    __func__, req_cpus[idx], drctl_req[idx].status, result,
511 		    (res[idx].string) ? res[idx].string : "");
512 	}
513 
514 	if ((rv = drctl_config_fini(&drctl_res_ck, drctl_req, count)) != 0)
515 		DR_DBG_CPU("%s: drctl_config_fini "
516 		    "returned: %d\n", __func__, rv);
517 
518 	/*
519 	 * Operation completed without any fatal errors.
520 	 * Pack the response for transmission.
521 	 */
522 	*resp_len = dr_cpu_pack_response(req, res, resp);
523 
524 	/* notify interested parties about the operation */
525 	dr_generate_event(DR_TYPE_CPU, se_hint);
526 
527 	/*
528 	 * Deallocate any scratch memory.
529 	 */
530 	DR_DBG_KMEM("%s: free addr %p size %ld\n",
531 	    __func__, (void *)drctl_resp, drctl_resp_len);
532 	kmem_free(drctl_resp, drctl_resp_len);
533 	DR_DBG_KMEM("%s: free addr %p size %ld\n",
534 	    __func__, (void *)drctl_req, drctl_req_len);
535 	kmem_free(drctl_req, drctl_req_len);
536 
537 	dr_cpu_res_array_fini(res, count);
538 
539 	return (0);
540 }
541 
542 /*
543  * Allocate and initialize a result array based on the initial
544  * drctl operation. A valid result array is always returned.
545  */
546 static dr_cpu_res_t *
547 dr_cpu_res_array_init(dr_cpu_hdr_t *req, drctl_rsrc_t *rsrc, int nrsrc)
548 {
549 	int		idx;
550 	dr_cpu_res_t	*res;
551 	char		*err_str;
552 	size_t		err_len;
553 
554 	/* allocate zero filled buffer to initialize fields */
555 	res = kmem_zalloc(nrsrc * sizeof (dr_cpu_res_t), KM_SLEEP);
556 	DR_DBG_KMEM("%s: alloc addr %p size %ld\n",
557 	    __func__, (void *)res, nrsrc * sizeof (dr_cpu_res_t));
558 
559 	/*
560 	 * Fill in the result information for each resource.
561 	 */
562 	for (idx = 0; idx < nrsrc; idx++) {
563 		res[idx].cpuid = rsrc[idx].res_cpu_id;
564 		res[idx].result = DR_CPU_RES_OK;
565 
566 		if (rsrc[idx].status == DRCTL_STATUS_ALLOW)
567 			continue;
568 
569 		/*
570 		 * Update the state information for this CPU.
571 		 */
572 		res[idx].result = DR_CPU_RES_BLOCKED;
573 		res[idx].status = (req->msg_type == DR_CPU_CONFIGURE) ?
574 		    DR_CPU_STAT_UNCONFIGURED : DR_CPU_STAT_CONFIGURED;
575 
576 		/*
577 		 * If an error string exists, copy it out of the
578 		 * message buffer. This eliminates any dependency
579 		 * on the memory allocated for the message buffer
580 		 * itself.
581 		 */
582 		if (rsrc[idx].offset != NULL) {
583 			err_str = (char *)rsrc + rsrc[idx].offset;
584 			err_len = strlen(err_str) + 1;
585 
586 			res[idx].string = kmem_alloc(err_len, KM_SLEEP);
587 			DR_DBG_KMEM("%s: alloc addr %p size %ld\n",
588 			    __func__, (void *)(res[idx].string), err_len);
589 			bcopy(err_str, res[idx].string, err_len);
590 		}
591 	}
592 
593 	return (res);
594 }
595 
596 static void
597 dr_cpu_res_array_fini(dr_cpu_res_t *res, int nres)
598 {
599 	int	idx;
600 	size_t	str_len;
601 
602 	for (idx = 0; idx < nres; idx++) {
603 		/* deallocate the error string if present */
604 		if (res[idx].string) {
605 			str_len = strlen(res[idx].string) + 1;
606 			DR_DBG_KMEM("%s: free addr %p size %ld\n",
607 			    __func__, (void *)(res[idx].string), str_len);
608 			kmem_free(res[idx].string, str_len);
609 		}
610 	}
611 
612 	/* deallocate the result array itself */
613 	DR_DBG_KMEM("%s: free addr %p size %ld\n",
614 	    __func__, (void *)res, sizeof (dr_cpu_res_t) * nres);
615 	kmem_free(res, sizeof (dr_cpu_res_t) * nres);
616 }
617 
618 /*
619  * Allocate and pack a response message for transmission based
620  * on the specified result array. A valid response message and
621  * valid size information is always returned.
622  */
623 static size_t
624 dr_cpu_pack_response(dr_cpu_hdr_t *req, dr_cpu_res_t *res, dr_cpu_hdr_t **respp)
625 {
626 	int		idx;
627 	dr_cpu_hdr_t	*resp;
628 	dr_cpu_stat_t	*resp_stat;
629 	size_t		resp_len;
630 	uint32_t	curr_off;
631 	caddr_t		curr_str;
632 	size_t		str_len;
633 	size_t		stat_len;
634 	int		nstat = req->num_records;
635 
636 	/*
637 	 * Calculate the size of the response message
638 	 * and allocate an appropriately sized buffer.
639 	 */
640 	resp_len = 0;
641 
642 	/* add the header size */
643 	resp_len += sizeof (dr_cpu_hdr_t);
644 
645 	/* add the stat array size */
646 	stat_len = sizeof (dr_cpu_stat_t) * nstat;
647 	resp_len += stat_len;
648 
649 	/* add the size of any error strings */
650 	for (idx = 0; idx < nstat; idx++) {
651 		if (res[idx].string != NULL) {
652 			resp_len += strlen(res[idx].string) + 1;
653 		}
654 	}
655 
656 	/* allocate the message buffer */
657 	resp = kmem_zalloc(resp_len, KM_SLEEP);
658 	DR_DBG_KMEM("%s: alloc addr %p size %ld\n",
659 	    __func__, (void *)resp, resp_len);
660 
661 	/*
662 	 * Fill in the header information.
663 	 */
664 	resp->req_num = req->req_num;
665 	resp->msg_type = DR_CPU_OK;
666 	resp->num_records = nstat;
667 
668 	/*
669 	 * Fill in the stat information.
670 	 */
671 	resp_stat = DR_CPU_RESP_STATS(resp);
672 
673 	/* string offsets start immediately after stat array */
674 	curr_off = sizeof (dr_cpu_hdr_t) + stat_len;
675 	curr_str = (char *)resp_stat + stat_len;
676 
677 	for (idx = 0; idx < nstat; idx++) {
678 		resp_stat[idx].cpuid = res[idx].cpuid;
679 		resp_stat[idx].result = res[idx].result;
680 		resp_stat[idx].status = res[idx].status;
681 
682 		if (res[idx].string != NULL) {
683 			/* copy over the error string */
684 			str_len = strlen(res[idx].string) + 1;
685 			bcopy(res[idx].string, curr_str, str_len);
686 			resp_stat[idx].string_off = curr_off;
687 
688 			curr_off += str_len;
689 			curr_str += str_len;
690 		}
691 	}
692 
693 	/* buffer should be exactly filled */
694 	ASSERT(curr_off == resp_len);
695 
696 	*respp = resp;
697 	return (resp_len);
698 }
699 
700 /*
701  * Check for conditions that will prevent a CPU from being offlined.
702  * This provides the opportunity to generate useful information to
703  * help diagnose the failure rather than letting the offline attempt
704  * fail in a more generic way.
705  */
706 static void
707 dr_cpu_check_cpus(dr_cpu_hdr_t *req, dr_cpu_res_t *res)
708 {
709 	int		idx;
710 	cpu_t		*cp;
711 	uint32_t	*cpuids;
712 
713 	ASSERT((req->msg_type == DR_CPU_UNCONFIGURE) ||
714 	    (req->msg_type == DR_CPU_FORCE_UNCONFIG));
715 
716 	DR_DBG_CPU("dr_cpu_check_cpus...\n");
717 
718 	/* array of cpuids start just after the header */
719 	cpuids = DR_CPU_CMD_CPUIDS(req);
720 
721 	mutex_enter(&cpu_lock);
722 
723 	/*
724 	 * Always check processor set membership first. The
725 	 * last CPU in a processor set will fail to offline
726 	 * even if the operation if forced, so any failures
727 	 * should always be reported.
728 	 */
729 	dr_cpu_check_psrset(cpuids, res, req->num_records);
730 
731 	/* process each cpu that is part of the request */
732 	for (idx = 0; idx < req->num_records; idx++) {
733 
734 		/* nothing to check if the CPU has already failed */
735 		if (res[idx].result != DR_CPU_RES_OK)
736 			continue;
737 
738 		if ((cp = cpu_get(cpuids[idx])) == NULL)
739 			continue;
740 
741 		/*
742 		 * Only check if there are bound threads if the
743 		 * operation is not a forced unconfigure. In a
744 		 * forced request, threads are automatically
745 		 * unbound before they are offlined.
746 		 */
747 		if (req->msg_type == DR_CPU_UNCONFIGURE) {
748 			/*
749 			 * The return value is only interesting if other
750 			 * checks are added to this loop and a decision
751 			 * is needed on whether to continue checking.
752 			 */
753 			(void) dr_cpu_check_bound_thr(cp, &res[idx]);
754 		}
755 	}
756 
757 	mutex_exit(&cpu_lock);
758 }
759 
760 /*
761  * Examine the processor set configuration for the specified
762  * CPUs and see if the unconfigure operation would result in
763  * trying to remove the last CPU in any processor set.
764  */
765 static void
766 dr_cpu_check_psrset(uint32_t *cpuids, dr_cpu_res_t *res, int nres)
767 {
768 	int		cpu_idx;
769 	int		set_idx;
770 	cpu_t		*cp;
771 	cpupart_t	*cpp;
772 	char		err_str[DR_CPU_MAX_ERR_LEN];
773 	size_t		err_len;
774 	struct {
775 		cpupart_t	*cpp;
776 		int		ncpus;
777 	} *psrset;
778 
779 	ASSERT(MUTEX_HELD(&cpu_lock));
780 
781 	/*
782 	 * Allocate a scratch array to count the CPUs in
783 	 * the various processor sets. A CPU always belongs
784 	 * to exactly one processor set, so by definition,
785 	 * the scratch array never needs to be larger than
786 	 * the number of CPUs.
787 	 */
788 	psrset = kmem_zalloc(sizeof (*psrset) * nres, KM_SLEEP);
789 	DR_DBG_KMEM("%s: alloc addr %p size %ld\n",
790 	    __func__, (void *)psrset, sizeof (*psrset) * nres);
791 
792 	for (cpu_idx = 0; cpu_idx < nres; cpu_idx++) {
793 
794 		/* skip any CPUs that have already failed */
795 		if (res[cpu_idx].result != DR_CPU_RES_OK)
796 			continue;
797 
798 		if ((cp = cpu_get(cpuids[cpu_idx])) == NULL)
799 			continue;
800 
801 		cpp = cp->cpu_part;
802 
803 		/* lookup the set this CPU belongs to */
804 		for (set_idx = 0; set_idx < nres; set_idx++) {
805 
806 			/* matching set found */
807 			if (cpp == psrset[set_idx].cpp)
808 				break;
809 
810 			/* set not found, start a new entry */
811 			if (psrset[set_idx].cpp == NULL) {
812 				psrset[set_idx].cpp = cpp;
813 				psrset[set_idx].ncpus = cpp->cp_ncpus;
814 				break;
815 			}
816 		}
817 
818 		ASSERT(set_idx != nres);
819 
820 		/*
821 		 * Remove the current CPU from the set total but only
822 		 * generate an error for the last CPU. The correct CPU
823 		 * will get the error because the unconfigure attempts
824 		 * will occur in the same order in which the CPUs are
825 		 * examined in this loop.  The cp_ncpus field of a
826 		 * cpupart_t counts only online cpus, so it is safe
827 		 * to remove an offline cpu without testing ncpus.
828 		 */
829 		if (cp->cpu_flags & CPU_OFFLINE)
830 			continue;
831 
832 		if (--psrset[set_idx].ncpus == 0) {
833 			/*
834 			 * Fill in the various pieces of information
835 			 * to report that the operation will fail.
836 			 */
837 			res[cpu_idx].result = DR_CPU_RES_BLOCKED;
838 			res[cpu_idx].status = DR_CPU_STAT_CONFIGURED;
839 
840 			(void) snprintf(err_str, DR_CPU_MAX_ERR_LEN,
841 			    "last online cpu in processor set %d", cpp->cp_id);
842 
843 			err_len = strlen(err_str) + 1;
844 
845 			res[cpu_idx].string = kmem_alloc(err_len, KM_SLEEP);
846 			DR_DBG_KMEM("%s: alloc addr %p size %ld\n",
847 			    __func__, (void *)(res[cpu_idx].string), err_len);
848 			bcopy(err_str, res[cpu_idx].string, err_len);
849 
850 			DR_DBG_CPU("cpu %d: %s\n", cpuids[cpu_idx], err_str);
851 		}
852 	}
853 
854 	DR_DBG_KMEM("%s: free addr %p size %ld\n",
855 	    __func__, (void *)psrset, sizeof (*psrset) * nres);
856 	kmem_free(psrset, sizeof (*psrset) * nres);
857 }
858 
859 /*
860  * Check if any threads are bound to the specified CPU. If the
861  * condition is true, DR_CPU_RES_BLOCKED is returned and an error
862  * string is generated and placed in the specified result structure.
863  * Otherwise, DR_CPU_RES_OK is returned.
864  */
865 static int
866 dr_cpu_check_bound_thr(cpu_t *cp, dr_cpu_res_t *res)
867 {
868 	int		nbound;
869 	proc_t		*pp;
870 	kthread_t	*tp;
871 	char		err_str[DR_CPU_MAX_ERR_LEN];
872 	size_t		err_len;
873 
874 	/*
875 	 * Error string allocation makes an assumption
876 	 * that no blocking condition has been identified.
877 	 */
878 	ASSERT(res->result == DR_CPU_RES_OK);
879 	ASSERT(res->string == NULL);
880 
881 	ASSERT(MUTEX_HELD(&cpu_lock));
882 
883 	mutex_enter(&pidlock);
884 
885 	nbound = 0;
886 
887 	/*
888 	 * Walk the active processes, checking if each
889 	 * thread belonging to the process is bound.
890 	 */
891 	for (pp = practive; (pp != NULL) && (nbound <= 1); pp = pp->p_next) {
892 		mutex_enter(&pp->p_lock);
893 
894 		tp = pp->p_tlist;
895 
896 		if ((tp == NULL) || (pp->p_flag & SSYS)) {
897 			mutex_exit(&pp->p_lock);
898 			continue;
899 		}
900 
901 		do {
902 			if (tp->t_bind_cpu != cp->cpu_id)
903 				continue;
904 
905 			/*
906 			 * Update the running total of bound
907 			 * threads. Continue the search until
908 			 * it can be determined if more than
909 			 * one thread is bound to the CPU.
910 			 */
911 			if (++nbound > 1)
912 				break;
913 
914 		} while ((tp = tp->t_forw) != pp->p_tlist);
915 
916 		mutex_exit(&pp->p_lock);
917 	}
918 
919 	mutex_exit(&pidlock);
920 
921 	if (nbound) {
922 		/*
923 		 * Threads are bound to the CPU. Fill in
924 		 * various pieces of information to report
925 		 * that the operation will fail.
926 		 */
927 		res->result = DR_CPU_RES_BLOCKED;
928 		res->status = DR_CPU_STAT_CONFIGURED;
929 
930 		(void) snprintf(err_str, DR_CPU_MAX_ERR_LEN, "cpu has bound "
931 		    "thread%s", (nbound > 1) ? "s" : "");
932 
933 		err_len = strlen(err_str) + 1;
934 
935 		res->string = kmem_alloc(err_len, KM_SLEEP);
936 		DR_DBG_KMEM("%s: alloc addr %p size %ld\n",
937 		    __func__, (void *)(res->string), err_len);
938 		bcopy(err_str, res->string, err_len);
939 
940 		DR_DBG_CPU("cpu %d: %s\n", cp->cpu_id, err_str);
941 	}
942 
943 	return (res->result);
944 }
945 
946 /*
947  * Do not modify result buffer or length on error.
948  */
949 static int
950 dr_cpu_list_status(dr_cpu_hdr_t *req, dr_cpu_hdr_t **resp, int *resp_len)
951 {
952 	int		idx;
953 	int		result;
954 	int		status;
955 	int		rlen;
956 	uint32_t	*cpuids;
957 	dr_cpu_hdr_t	*rp;
958 	dr_cpu_stat_t	*stat;
959 	md_t		*mdp = NULL;
960 	int		num_nodes;
961 	int		listsz;
962 	mde_cookie_t	*listp = NULL;
963 	mde_cookie_t	cpunode;
964 	boolean_t	walk_md = B_FALSE;
965 
966 	/* the incoming array of cpuids to configure */
967 	cpuids = DR_CPU_CMD_CPUIDS(req);
968 
969 	/* allocate a response message */
970 	rlen = sizeof (dr_cpu_hdr_t);
971 	rlen += req->num_records * sizeof (dr_cpu_stat_t);
972 	rp = kmem_zalloc(rlen, KM_SLEEP);
973 	DR_DBG_KMEM("%s: alloc addr %p size %d\n", __func__, (void *)rp, rlen);
974 
975 	/* fill in the known data */
976 	rp->req_num = req->req_num;
977 	rp->msg_type = DR_CPU_STATUS;
978 	rp->num_records = req->num_records;
979 
980 	/* stat array for the response */
981 	stat = DR_CPU_RESP_STATS(rp);
982 
983 	/* get the status for each of the CPUs */
984 	for (idx = 0; idx < req->num_records; idx++) {
985 
986 		result = dr_cpu_status(cpuids[idx], &status);
987 
988 		if (result == DR_CPU_RES_FAILURE)
989 			walk_md = B_TRUE;
990 
991 		/* save off results of the status */
992 		stat[idx].cpuid = cpuids[idx];
993 		stat[idx].result = result;
994 		stat[idx].status = status;
995 	}
996 
997 	if (walk_md == B_FALSE)
998 		goto done;
999 
1000 	/*
1001 	 * At least one of the cpus did not have a CPU
1002 	 * structure. So, consult the MD to determine if
1003 	 * they are present.
1004 	 */
1005 
1006 	if ((mdp = md_get_handle()) == NULL) {
1007 		DR_DBG_CPU("unable to initialize MD\n");
1008 		goto done;
1009 	}
1010 
1011 	num_nodes = md_node_count(mdp);
1012 	ASSERT(num_nodes > 0);
1013 
1014 	listsz = num_nodes * sizeof (mde_cookie_t);
1015 	listp = kmem_zalloc(listsz, KM_SLEEP);
1016 	DR_DBG_KMEM("%s: alloc addr %p size %d\n",
1017 	    __func__, (void *)listp, listsz);
1018 
1019 	for (idx = 0; idx < req->num_records; idx++) {
1020 
1021 		if (stat[idx].result != DR_CPU_RES_FAILURE)
1022 			continue;
1023 
1024 		/* check the MD for the current cpuid */
1025 		cpunode = dr_cpu_find_node_md(stat[idx].cpuid, mdp, listp);
1026 
1027 		stat[idx].result = DR_CPU_RES_OK;
1028 
1029 		if (cpunode == MDE_INVAL_ELEM_COOKIE) {
1030 			stat[idx].status = DR_CPU_STAT_NOT_PRESENT;
1031 		} else {
1032 			stat[idx].status = DR_CPU_STAT_UNCONFIGURED;
1033 		}
1034 	}
1035 
1036 	DR_DBG_KMEM("%s: free addr %p size %d\n",
1037 	    __func__, (void *)listp, listsz);
1038 	kmem_free(listp, listsz);
1039 
1040 	(void) md_fini_handle(mdp);
1041 
1042 done:
1043 	*resp = rp;
1044 	*resp_len = rlen;
1045 
1046 	return (0);
1047 }
1048 
1049 static int
1050 dr_cpu_configure(processorid_t cpuid, int *status, boolean_t force)
1051 {
1052 	 _NOTE(ARGUNUSED(force))
1053 	struct cpu	*cp;
1054 	int		rv = 0;
1055 
1056 	DR_DBG_CPU("dr_cpu_configure...\n");
1057 
1058 	/*
1059 	 * Build device tree node for the CPU
1060 	 */
1061 	if ((rv = dr_cpu_probe(cpuid)) != 0) {
1062 		DR_DBG_CPU("failed to probe CPU %d (%d)\n", cpuid, rv);
1063 		if (rv == EINVAL) {
1064 			*status = DR_CPU_STAT_NOT_PRESENT;
1065 			return (DR_CPU_RES_NOT_IN_MD);
1066 		}
1067 		*status = DR_CPU_STAT_UNCONFIGURED;
1068 		return (DR_CPU_RES_FAILURE);
1069 	}
1070 
1071 	mutex_enter(&cpu_lock);
1072 
1073 	/*
1074 	 * Configure the CPU
1075 	 */
1076 	if ((cp = cpu_get(cpuid)) == NULL) {
1077 
1078 		if ((rv = cpu_configure(cpuid)) != 0) {
1079 			DR_DBG_CPU("failed to configure CPU %d (%d)\n",
1080 			    cpuid, rv);
1081 			rv = DR_CPU_RES_FAILURE;
1082 			*status = DR_CPU_STAT_UNCONFIGURED;
1083 			goto done;
1084 		}
1085 
1086 		DR_DBG_CPU("CPU %d configured\n", cpuid);
1087 
1088 		/* CPU struct should exist now */
1089 		cp = cpu_get(cpuid);
1090 	}
1091 
1092 	ASSERT(cp);
1093 
1094 	/*
1095 	 * Power on the CPU. In sun4v, this brings the stopped
1096 	 * CPU into the guest from the Hypervisor.
1097 	 */
1098 	if (cpu_is_poweredoff(cp)) {
1099 
1100 		if ((rv = cpu_poweron(cp)) != 0) {
1101 			DR_DBG_CPU("failed to power on CPU %d (%d)\n",
1102 			    cpuid, rv);
1103 			rv = DR_CPU_RES_FAILURE;
1104 			*status = DR_CPU_STAT_UNCONFIGURED;
1105 			goto done;
1106 		}
1107 
1108 		DR_DBG_CPU("CPU %d powered on\n", cpuid);
1109 	}
1110 
1111 	/*
1112 	 * Online the CPU
1113 	 */
1114 	if (cpu_is_offline(cp)) {
1115 
1116 		if ((rv = cpu_online(cp)) != 0) {
1117 			DR_DBG_CPU("failed to online CPU %d (%d)\n",
1118 			    cpuid, rv);
1119 			rv = DR_CPU_RES_FAILURE;
1120 			/* offline is still configured */
1121 			*status = DR_CPU_STAT_CONFIGURED;
1122 			goto done;
1123 		}
1124 
1125 		DR_DBG_CPU("CPU %d online\n", cpuid);
1126 	}
1127 
1128 	rv = DR_CPU_RES_OK;
1129 	*status = DR_CPU_STAT_CONFIGURED;
1130 
1131 done:
1132 	mutex_exit(&cpu_lock);
1133 
1134 	return (rv);
1135 }
1136 
1137 static int
1138 dr_cpu_unconfigure(processorid_t cpuid, int *status, boolean_t force)
1139 {
1140 	struct cpu	*cp;
1141 	int		rv = 0;
1142 	int		cpu_flags;
1143 
1144 	DR_DBG_CPU("dr_cpu_unconfigure%s...\n", (force) ? " (force)" : "");
1145 
1146 	mutex_enter(&cpu_lock);
1147 
1148 	cp = cpu_get(cpuid);
1149 
1150 	if (cp == NULL) {
1151 
1152 		/*
1153 		 * The OS CPU structures are already torn down,
1154 		 * Attempt to deprobe the CPU to make sure the
1155 		 * device tree is up to date.
1156 		 */
1157 		if (dr_cpu_deprobe(cpuid) != 0) {
1158 			DR_DBG_CPU("failed to deprobe CPU %d\n", cpuid);
1159 			rv = DR_CPU_RES_FAILURE;
1160 			*status = DR_CPU_STAT_UNCONFIGURED;
1161 			goto done;
1162 		}
1163 
1164 		goto done;
1165 	}
1166 
1167 	ASSERT(cp->cpu_id == cpuid);
1168 
1169 	/*
1170 	 * Offline the CPU
1171 	 */
1172 	if (cpu_is_active(cp)) {
1173 
1174 		/* set the force flag correctly */
1175 		cpu_flags = (force) ? CPU_FORCED : 0;
1176 
1177 		if ((rv = cpu_offline(cp, cpu_flags)) != 0) {
1178 			DR_DBG_CPU("failed to offline CPU %d (%d)\n",
1179 			    cpuid, rv);
1180 
1181 			rv = DR_CPU_RES_FAILURE;
1182 			*status = DR_CPU_STAT_CONFIGURED;
1183 			goto done;
1184 		}
1185 
1186 		DR_DBG_CPU("CPU %d offline\n", cpuid);
1187 	}
1188 
1189 	/*
1190 	 * Power off the CPU. In sun4v, this puts the running
1191 	 * CPU into the stopped state in the Hypervisor.
1192 	 */
1193 	if (!cpu_is_poweredoff(cp)) {
1194 
1195 		if ((rv = cpu_poweroff(cp)) != 0) {
1196 			DR_DBG_CPU("failed to power off CPU %d (%d)\n",
1197 			    cpuid, rv);
1198 			rv = DR_CPU_RES_FAILURE;
1199 			*status = DR_CPU_STAT_CONFIGURED;
1200 			goto done;
1201 		}
1202 
1203 		DR_DBG_CPU("CPU %d powered off\n", cpuid);
1204 	}
1205 
1206 	/*
1207 	 * Unconfigure the CPU
1208 	 */
1209 	if ((rv = cpu_unconfigure(cpuid)) != 0) {
1210 		DR_DBG_CPU("failed to unconfigure CPU %d (%d)\n", cpuid, rv);
1211 		rv = DR_CPU_RES_FAILURE;
1212 		*status = DR_CPU_STAT_UNCONFIGURED;
1213 		goto done;
1214 	}
1215 
1216 	DR_DBG_CPU("CPU %d unconfigured\n", cpuid);
1217 
1218 	/*
1219 	 * Tear down device tree.
1220 	 */
1221 	if ((rv = dr_cpu_deprobe(cpuid)) != 0) {
1222 		DR_DBG_CPU("failed to deprobe CPU %d (%d)\n", cpuid, rv);
1223 		rv = DR_CPU_RES_FAILURE;
1224 		*status = DR_CPU_STAT_UNCONFIGURED;
1225 		goto done;
1226 	}
1227 
1228 	rv = DR_CPU_RES_OK;
1229 	*status = DR_CPU_STAT_UNCONFIGURED;
1230 
1231 done:
1232 	mutex_exit(&cpu_lock);
1233 
1234 	return (rv);
1235 }
1236 
1237 /*
1238  * Determine the state of a CPU. If the CPU structure is not present,
1239  * it does not attempt to determine whether or not the CPU is in the
1240  * MD. It is more efficient to do this at the higher level for all
1241  * CPUs since it may not even be necessary to search the MD if all
1242  * the CPUs are accounted for. Returns DR_CPU_RES_OK if the CPU
1243  * structure is present, and DR_CPU_RES_FAILURE otherwise as a signal
1244  * that an MD walk is necessary.
1245  */
1246 static int
1247 dr_cpu_status(processorid_t cpuid, int *status)
1248 {
1249 	int		rv;
1250 	struct cpu	*cp;
1251 
1252 	DR_DBG_CPU("dr_cpu_status...\n");
1253 
1254 	mutex_enter(&cpu_lock);
1255 
1256 	if ((cp = cpu_get(cpuid)) == NULL) {
1257 		/* need to check if cpu is in the MD */
1258 		rv = DR_CPU_RES_FAILURE;
1259 		goto done;
1260 	}
1261 
1262 	if (cpu_is_poweredoff(cp)) {
1263 		/*
1264 		 * The CPU is powered off, so it is considered
1265 		 * unconfigured from the service entity point of
1266 		 * view. The CPU is not available to the system
1267 		 * and intervention by the service entity would
1268 		 * be required to change that.
1269 		 */
1270 		*status = DR_CPU_STAT_UNCONFIGURED;
1271 	} else {
1272 		/*
1273 		 * The CPU is powered on, so it is considered
1274 		 * configured from the service entity point of
1275 		 * view. It is available for use by the system
1276 		 * and service entities are not concerned about
1277 		 * the operational status (offline, online, etc.)
1278 		 * of the CPU in terms of DR.
1279 		 */
1280 		*status = DR_CPU_STAT_CONFIGURED;
1281 	}
1282 
1283 	rv = DR_CPU_RES_OK;
1284 
1285 done:
1286 	mutex_exit(&cpu_lock);
1287 
1288 	return (rv);
1289 }
1290 
/*
 * Argument block handed to new_cpu_node() through the branch create
 * request built in dr_cpu_probe().
 */
typedef struct {
	md_t		*mdp;		/* in: MD handle to read properties */
	mde_cookie_t	cpunode;	/* in: MD node of the CPU to create */
	dev_info_t	*dip;		/* out: set to the new device node */
} cb_arg_t;

/*
 * Capacity of the array new_cpu_node() uses to collect the strings of
 * the MD 'compatible' property; additional strings are not copied.
 */
#define	STR_ARR_LEN	5
1298 
1299 static int
1300 new_cpu_node(dev_info_t *new_node, void *arg, uint_t flags)
1301 {
1302 	_NOTE(ARGUNUSED(flags))
1303 
1304 	char		*compat;
1305 	uint64_t	freq;
1306 	uint64_t	cpuid = 0;
1307 	int		regbuf[4];
1308 	int		len = 0;
1309 	cb_arg_t	*cba;
1310 	char		*str_arr[STR_ARR_LEN];
1311 	char		*curr;
1312 	int		idx = 0;
1313 
1314 	DR_DBG_CPU("new_cpu_node...\n");
1315 
1316 	cba = (cb_arg_t *)arg;
1317 
1318 	/*
1319 	 * Add 'name' property
1320 	 */
1321 	if (ndi_prop_update_string(DDI_DEV_T_NONE, new_node,
1322 	    "name", "cpu") != DDI_SUCCESS) {
1323 		DR_DBG_CPU("new_cpu_node: failed to create 'name' property\n");
1324 		return (DDI_WALK_ERROR);
1325 	}
1326 
1327 	/*
1328 	 * Add 'compatible' property
1329 	 */
1330 	if (md_get_prop_data(cba->mdp, cba->cpunode, "compatible",
1331 	    (uint8_t **)(&compat), &len)) {
1332 		DR_DBG_CPU("new_cpu_node: failed to read 'compatible' property "
1333 		    "from MD\n");
1334 		return (DDI_WALK_ERROR);
1335 	}
1336 
1337 	DR_DBG_CPU("'compatible' len is %d\n", len);
1338 
1339 	/* parse the MD string array */
1340 	curr = compat;
1341 	while (curr < (compat + len)) {
1342 
1343 		DR_DBG_CPU("adding '%s' to 'compatible' property\n", curr);
1344 
1345 		str_arr[idx++] = curr;
1346 		curr += strlen(curr) + 1;
1347 
1348 		if (idx == STR_ARR_LEN) {
1349 			DR_DBG_CPU("exceeded str_arr len (%d)\n", STR_ARR_LEN);
1350 			break;
1351 		}
1352 	}
1353 
1354 	if (ndi_prop_update_string_array(DDI_DEV_T_NONE, new_node,
1355 	    "compatible", str_arr, idx) != DDI_SUCCESS) {
1356 		DR_DBG_CPU("new_cpu_node: failed to create 'compatible' "
1357 		    "property\n");
1358 		return (DDI_WALK_ERROR);
1359 	}
1360 
1361 	/*
1362 	 * Add 'device_type' property
1363 	 */
1364 	if (ndi_prop_update_string(DDI_DEV_T_NONE, new_node,
1365 	    "device_type", "cpu") != DDI_SUCCESS) {
1366 		DR_DBG_CPU("new_cpu_node: failed to create 'device_type' "
1367 		    "property\n");
1368 		return (DDI_WALK_ERROR);
1369 	}
1370 
1371 	/*
1372 	 * Add 'clock-frequency' property
1373 	 */
1374 	if (md_get_prop_val(cba->mdp, cba->cpunode, "clock-frequency", &freq)) {
1375 		DR_DBG_CPU("new_cpu_node: failed to read 'clock-frequency' "
1376 		    "property from MD\n");
1377 		return (DDI_WALK_ERROR);
1378 	}
1379 
1380 	if (ndi_prop_update_int(DDI_DEV_T_NONE, new_node,
1381 	    "clock-frequency", freq) != DDI_SUCCESS) {
1382 		DR_DBG_CPU("new_cpu_node: failed to create 'clock-frequency' "
1383 		    "property\n");
1384 		return (DDI_WALK_ERROR);
1385 	}
1386 
1387 	/*
1388 	 * Add 'reg' (cpuid) property
1389 	 */
1390 	if (md_get_prop_val(cba->mdp, cba->cpunode, "id", &cpuid)) {
1391 		DR_DBG_CPU("new_cpu_node: failed to read 'id' property "
1392 		    "from MD\n");
1393 		return (DDI_WALK_ERROR);
1394 	}
1395 
1396 	DR_DBG_CPU("new cpuid=0x%lx\n", cpuid);
1397 
1398 	bzero(regbuf, 4 * sizeof (int));
1399 	regbuf[0] = 0xc0000000 | cpuid;
1400 
1401 	if (ndi_prop_update_int_array(DDI_DEV_T_NONE, new_node,
1402 	    "reg", regbuf, 4) != DDI_SUCCESS) {
1403 		DR_DBG_CPU("new_cpu_node: failed to create 'reg' property\n");
1404 		return (DDI_WALK_ERROR);
1405 	}
1406 
1407 	cba->dip = new_node;
1408 
1409 	return (DDI_WALK_TERMINATE);
1410 }
1411 
1412 static int
1413 dr_cpu_probe(processorid_t cpuid)
1414 {
1415 	dev_info_t	*pdip;
1416 	dev_info_t	*dip;
1417 	devi_branch_t	br;
1418 	md_t		*mdp = NULL;
1419 	int		num_nodes;
1420 	int		rv = 0;
1421 	int		listsz;
1422 	mde_cookie_t	*listp = NULL;
1423 	cb_arg_t	cba;
1424 	mde_cookie_t	cpunode;
1425 
1426 	if ((dip = dr_cpu_find_node(cpuid)) != NULL) {
1427 		/* nothing to do */
1428 		e_ddi_branch_rele(dip);
1429 		return (0);
1430 	}
1431 
1432 	if ((mdp = md_get_handle()) == NULL) {
1433 		DR_DBG_CPU("unable to initialize machine description\n");
1434 		return (-1);
1435 	}
1436 
1437 	num_nodes = md_node_count(mdp);
1438 	ASSERT(num_nodes > 0);
1439 
1440 	listsz = num_nodes * sizeof (mde_cookie_t);
1441 	listp = kmem_zalloc(listsz, KM_SLEEP);
1442 	DR_DBG_KMEM("%s: alloc addr %p size %d\n",
1443 	    __func__, (void *)listp, listsz);
1444 
1445 	cpunode = dr_cpu_find_node_md(cpuid, mdp, listp);
1446 
1447 	if (cpunode == MDE_INVAL_ELEM_COOKIE) {
1448 		rv = EINVAL;
1449 		goto done;
1450 	}
1451 
1452 	/* pass in MD cookie for CPU */
1453 	cba.mdp = mdp;
1454 	cba.cpunode = cpunode;
1455 
1456 	br.arg = (void *)&cba;
1457 	br.type = DEVI_BRANCH_SID;
1458 	br.create.sid_branch_create = new_cpu_node;
1459 	br.devi_branch_callback = NULL;
1460 	pdip = ddi_root_node();
1461 
1462 	if ((rv = e_ddi_branch_create(pdip, &br, NULL, 0))) {
1463 		DR_DBG_CPU("e_ddi_branch_create failed: %d\n", rv);
1464 		rv = -1;
1465 		goto done;
1466 	}
1467 
1468 	DR_DBG_CPU("CPU %d probed\n", cpuid);
1469 
1470 	rv = 0;
1471 
1472 done:
1473 	if (listp) {
1474 		DR_DBG_KMEM("%s: free addr %p size %d\n",
1475 		    __func__, (void *)listp, listsz);
1476 		kmem_free(listp, listsz);
1477 	}
1478 
1479 	if (mdp)
1480 		(void) md_fini_handle(mdp);
1481 
1482 	return (rv);
1483 }
1484 
1485 static int
1486 dr_cpu_deprobe(processorid_t cpuid)
1487 {
1488 	dev_info_t	*fdip = NULL;
1489 	dev_info_t	*dip;
1490 
1491 	if ((dip = dr_cpu_find_node(cpuid)) == NULL) {
1492 		DR_DBG_CPU("cpuid %d already deprobed\n", cpuid);
1493 		return (0);
1494 	}
1495 
1496 	ASSERT(e_ddi_branch_held(dip));
1497 
1498 	if (e_ddi_branch_destroy(dip, &fdip, 0)) {
1499 		char *path = kmem_alloc(MAXPATHLEN, KM_SLEEP);
1500 
1501 		DR_DBG_KMEM("%s: alloc addr %p size %d\n",
1502 		    __func__, (void *)path, MAXPATHLEN);
1503 		/*
1504 		 * If non-NULL, fdip is held and must be released.
1505 		 */
1506 		if (fdip != NULL) {
1507 			(void) ddi_pathname(fdip, path);
1508 			ddi_release_devi(fdip);
1509 		} else {
1510 			(void) ddi_pathname(dip, path);
1511 		}
1512 		cmn_err(CE_NOTE, "node removal failed: %s (%p)",
1513 		    path, (fdip) ? (void *)fdip : (void *)dip);
1514 
1515 		DR_DBG_KMEM("%s: free addr %p size %d\n",
1516 		    __func__, (void *)path, MAXPATHLEN);
1517 		kmem_free(path, MAXPATHLEN);
1518 
1519 		return (-1);
1520 	}
1521 
1522 	DR_DBG_CPU("CPU %d deprobed\n", cpuid);
1523 
1524 	return (0);
1525 }
1526 
/*
 * Search state shared between dr_cpu_find_node() and its device tree
 * walk callback, dr_cpu_check_node().
 */
typedef struct {
	processorid_t	cpuid;	/* in: cpuid being searched for */
	dev_info_t	*dip;	/* out: matching node (held), else NULL */
} dr_search_arg_t;
1531 
1532 static int
1533 dr_cpu_check_node(dev_info_t *dip, void *arg)
1534 {
1535 	char 		*name;
1536 	processorid_t	cpuid;
1537 	dr_search_arg_t	*sarg = (dr_search_arg_t *)arg;
1538 
1539 	if (dip == ddi_root_node()) {
1540 		return (DDI_WALK_CONTINUE);
1541 	}
1542 
1543 	name = ddi_node_name(dip);
1544 
1545 	if (strcmp(name, "cpu") != 0) {
1546 		return (DDI_WALK_PRUNECHILD);
1547 	}
1548 
1549 	cpuid = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
1550 	    "reg", -1);
1551 
1552 	cpuid = PROM_CFGHDL_TO_CPUID(cpuid);
1553 
1554 	DR_DBG_CPU("found cpuid=0x%x, looking for 0x%x\n", cpuid, sarg->cpuid);
1555 
1556 	if (cpuid == sarg->cpuid) {
1557 		DR_DBG_CPU("matching node\n");
1558 
1559 		/* matching node must be returned held */
1560 		if (!e_ddi_branch_held(dip))
1561 			e_ddi_branch_hold(dip);
1562 
1563 		sarg->dip = dip;
1564 		return (DDI_WALK_TERMINATE);
1565 	}
1566 
1567 	return (DDI_WALK_CONTINUE);
1568 }
1569 
1570 /*
1571  * Walk the device tree to find the dip corresponding to the cpuid
1572  * passed in. If present, the dip is returned held. The caller must
1573  * release the hold on the dip once it is no longer required. If no
1574  * matching node if found, NULL is returned.
1575  */
1576 static dev_info_t *
1577 dr_cpu_find_node(processorid_t cpuid)
1578 {
1579 	dr_search_arg_t	arg;
1580 
1581 	DR_DBG_CPU("dr_cpu_find_node...\n");
1582 
1583 	arg.cpuid = cpuid;
1584 	arg.dip = NULL;
1585 
1586 	ddi_walk_devs(ddi_root_node(), dr_cpu_check_node, &arg);
1587 
1588 	ASSERT((arg.dip == NULL) || (e_ddi_branch_held(arg.dip)));
1589 
1590 	return ((arg.dip) ? arg.dip : NULL);
1591 }
1592 
1593 /*
1594  * Look up a particular cpuid in the MD. Returns the mde_cookie_t
1595  * representing that CPU if present, and MDE_INVAL_ELEM_COOKIE
1596  * otherwise. It is assumed the scratch array has already been
1597  * allocated so that it can accommodate the worst case scenario,
1598  * every node in the MD.
1599  */
1600 static mde_cookie_t
1601 dr_cpu_find_node_md(processorid_t cpuid, md_t *mdp, mde_cookie_t *listp)
1602 {
1603 	int		idx;
1604 	int		nnodes;
1605 	mde_cookie_t	rootnode;
1606 	uint64_t	cpuid_prop;
1607 	mde_cookie_t	result = MDE_INVAL_ELEM_COOKIE;
1608 
1609 	rootnode = md_root_node(mdp);
1610 	ASSERT(rootnode != MDE_INVAL_ELEM_COOKIE);
1611 
1612 	/*
1613 	 * Scan the DAG for all the CPU nodes
1614 	 */
1615 	nnodes = md_scan_dag(mdp, rootnode, md_find_name(mdp, "cpu"),
1616 	    md_find_name(mdp, "fwd"), listp);
1617 
1618 	if (nnodes < 0) {
1619 		DR_DBG_CPU("Scan for CPUs failed\n");
1620 		return (result);
1621 	}
1622 
1623 	DR_DBG_CPU("dr_cpu_find_node_md: found %d CPUs in the MD\n", nnodes);
1624 
1625 	/*
1626 	 * Find the CPU of interest
1627 	 */
1628 	for (idx = 0; idx < nnodes; idx++) {
1629 
1630 		if (md_get_prop_val(mdp, listp[idx], "id", &cpuid_prop)) {
1631 			DR_DBG_CPU("Missing 'id' property for CPU node %d\n",
1632 			    idx);
1633 			break;
1634 		}
1635 
1636 		if (cpuid_prop == cpuid) {
1637 			/* found a match */
1638 			DR_DBG_CPU("dr_cpu_find_node_md: found CPU %d "
1639 			    "in MD\n", cpuid);
1640 			result = listp[idx];
1641 			break;
1642 		}
1643 	}
1644 
1645 	if (result == MDE_INVAL_ELEM_COOKIE) {
1646 		DR_DBG_CPU("CPU %d not in MD\n", cpuid);
1647 	}
1648 
1649 	return (result);
1650 }
1651