xref: /illumos-gate/usr/src/uts/sun4v/io/dr_cpu.c (revision c5806743f70246f7f708e57514b9103a6291d629)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * Copyright 2019 Joyent, Inc.
29  */
30 
31 /*
32  * sun4v CPU DR Module
33  */
34 
35 #include <sys/modctl.h>
36 #include <sys/processor.h>
37 #include <sys/cpuvar.h>
38 #include <sys/cpupart.h>
39 #include <sys/sunddi.h>
40 #include <sys/sunndi.h>
41 #include <sys/note.h>
42 #include <sys/sysevent/dr.h>
43 #include <sys/hypervisor_api.h>
44 #include <sys/mach_descrip.h>
45 #include <sys/mdesc.h>
46 #include <sys/ds.h>
47 #include <sys/drctl.h>
48 #include <sys/dr_util.h>
49 #include <sys/dr_cpu.h>
50 #include <sys/promif.h>
51 #include <sys/machsystm.h>
52 
53 
/*
 * Module linkage information. The module registers itself as a
 * miscellaneous module (mod_miscops) described by the string below.
 */
static struct modlmisc modlmisc = {
	&mod_miscops,
	"sun4v CPU DR"
};

/*
 * Linkage structure passed to mod_install(), mod_remove() and
 * mod_info() by _init(), _fini() and _info() respectively.
 */
static struct modlinkage modlinkage = {
	MODREV_1,
	(void *)&modlmisc,
	NULL
};
64 
/*
 * Common signature of the per-CPU worker routines (dr_cpu_configure()
 * and dr_cpu_unconfigure()) so that dr_cpu_list_wrk() can dispatch
 * through one pointer. The arguments are the target cpuid, a pointer
 * that receives the resulting CPU status, and the force flag.
 */
typedef int (*fn_t)(processorid_t, int *, boolean_t);

/*
 * Global DS Handle
 *
 * Recorded by dr_cpu_reg_handler(), reset to DS_INVALID_HDL by
 * dr_cpu_unreg_handler() and used by dr_cpu_data_handler() to send
 * responses.
 */
static ds_svc_hdl_t ds_handle;

/*
 * Supported DS Capability Versions
 */
static ds_ver_t		dr_cpu_vers[] = { { 1, 1 }, { 1, 0 } };
#define	DR_CPU_NVERS	(sizeof (dr_cpu_vers) / sizeof (dr_cpu_vers[0]))

/*
 * Version reported to dr_cpu_reg_handler() at registration time. It is
 * consulted through the DRCPU_VERS_*() macros below.
 */
static ds_ver_t		version;

/*
 * DS Capability Description
 */
static ds_capability_t dr_cpu_cap = {
	DR_CPU_DS_ID,		/* svc_id */
	dr_cpu_vers,		/* vers */
	DR_CPU_NVERS		/* nvers */
};

/* True when the registered version is exactly _maj._min. */
#define	DRCPU_VERS_EQ(_maj, _min) \
	((version.major == (_maj)) && (version.minor == (_min)))

/* True when the registered version is _maj._min or newer. */
#define	DRCPU_VERS_GTEQ(_maj, _min) \
	((version.major > (_maj)) ||					\
	((version.major == (_maj)) && (version.minor >= (_min))))
95 
/*
 * DS Callbacks
 *
 * Registration, unregistration and incoming-data handlers that are
 * installed through the ops vector below.
 */
static void dr_cpu_reg_handler(ds_cb_arg_t, ds_ver_t *, ds_svc_hdl_t);
static void dr_cpu_unreg_handler(ds_cb_arg_t arg);
static void dr_cpu_data_handler(ds_cb_arg_t arg, void *buf, size_t buflen);

/*
 * DS Client Ops Vector
 *
 * Handed to ds_cap_init() together with dr_cpu_cap by dr_cpu_init().
 * No callback argument is supplied (cb_arg is NULL).
 */
static ds_clnt_ops_t dr_cpu_ops = {
	dr_cpu_reg_handler,	/* ds_reg_cb */
	dr_cpu_unreg_handler,	/* ds_unreg_cb */
	dr_cpu_data_handler,	/* ds_data_cb */
	NULL			/* cb_arg */
};
112 
/*
 * Operation Results
 *
 * Used internally to gather results while an operation on a
 * list of CPUs is in progress. In particular, it is used to
 * keep track of which CPUs have already failed so that they are
 * not processed further, and the manner in which they failed.
 *
 * One entry exists per CPU in the request. The array is created by
 * dr_cpu_res_array_init() and released by dr_cpu_res_array_fini().
 */
typedef struct {
	uint32_t	cpuid;		/* CPU this entry describes */
	uint32_t	result;		/* DR_CPU_RES_* outcome */
	uint32_t	status;		/* DR_CPU_STAT_* state */
	char		*string;	/* error text, or NULL if none */
} dr_cpu_res_t;

/*
 * Size of the on-stack buffers used to format locally generated
 * error strings before they are copied into a result entry.
 */
#define	DR_CPU_MAX_ERR_LEN	64	/* maximum error string length */
129 
130 /*
131  * Internal Functions
132  */
133 static int dr_cpu_init(void);
134 static int dr_cpu_fini(void);
135 
136 static int dr_cpu_list_wrk(dr_cpu_hdr_t *, dr_cpu_hdr_t **, int *);
137 static int dr_cpu_list_status(dr_cpu_hdr_t *, dr_cpu_hdr_t **, int *);
138 
139 static int dr_cpu_unconfigure(processorid_t, int *status, boolean_t force);
140 static int dr_cpu_configure(processorid_t, int *status, boolean_t force);
141 static int dr_cpu_status(processorid_t, int *status);
142 
143 static void dr_cpu_check_cpus(dr_cpu_hdr_t *req, dr_cpu_res_t *res);
144 static void dr_cpu_check_psrset(uint32_t *cpuids, dr_cpu_res_t *res, int nres);
145 static int dr_cpu_check_bound_thr(cpu_t *cp, dr_cpu_res_t *res);
146 
147 static dr_cpu_res_t *dr_cpu_res_array_init(dr_cpu_hdr_t *, drctl_rsrc_t *, int);
148 static void dr_cpu_res_array_fini(dr_cpu_res_t *res, int nres);
149 static size_t dr_cpu_pack_response(dr_cpu_hdr_t *req, dr_cpu_res_t *res,
150     dr_cpu_hdr_t **respp);
151 
152 static int dr_cpu_probe(processorid_t newcpuid);
153 static int dr_cpu_deprobe(processorid_t cpuid);
154 
155 static dev_info_t *dr_cpu_find_node(processorid_t cpuid);
156 static mde_cookie_t dr_cpu_find_node_md(processorid_t, md_t *, mde_cookie_t *);
157 
158 int
159 _init(void)
160 {
161 	int	status;
162 
163 	/* check that CPU DR is enabled */
164 	if (dr_is_disabled(DR_TYPE_CPU)) {
165 		cmn_err(CE_CONT, "!CPU DR is disabled\n");
166 		return (-1);
167 	}
168 
169 	if ((status = dr_cpu_init()) != 0) {
170 		cmn_err(CE_NOTE, "CPU DR initialization failed");
171 		return (status);
172 	}
173 
174 	if ((status = mod_install(&modlinkage)) != 0) {
175 		(void) dr_cpu_fini();
176 	}
177 
178 	return (status);
179 }
180 
181 int
182 _info(struct modinfo *modinfop)
183 {
184 	return (mod_info(&modlinkage, modinfop));
185 }
186 
/*
 * Unload gate consulted by _fini(). While this is zero, _fini()
 * returns EBUSY and the module stays loaded. Nothing in the visible
 * source assigns it, so it is presumably meant to be patched by hand
 * (e.g. from a debugger) -- TODO confirm.
 */
int dr_cpu_allow_unload;
188 
189 int
190 _fini(void)
191 {
192 	int	status;
193 
194 	if (dr_cpu_allow_unload == 0)
195 		return (EBUSY);
196 
197 	if ((status = mod_remove(&modlinkage)) == 0) {
198 		(void) dr_cpu_fini();
199 	}
200 
201 	return (status);
202 }
203 
204 static int
205 dr_cpu_init(void)
206 {
207 	int	rv;
208 
209 	if ((rv = ds_cap_init(&dr_cpu_cap, &dr_cpu_ops)) != 0) {
210 		cmn_err(CE_NOTE, "ds_cap_init failed: %d", rv);
211 		return (-1);
212 	}
213 
214 	return (0);
215 }
216 
217 static int
218 dr_cpu_fini(void)
219 {
220 	int	rv;
221 
222 	if ((rv = ds_cap_fini(&dr_cpu_cap)) != 0) {
223 		cmn_err(CE_NOTE, "ds_cap_fini failed: %d", rv);
224 		return (-1);
225 	}
226 
227 	return (0);
228 }
229 
/*
 * DS registration callback. Records the version that was passed in
 * and the service handle in the file-scope variables so that later
 * requests can be answered and version-dependent behavior selected.
 */
static void
dr_cpu_reg_handler(ds_cb_arg_t arg, ds_ver_t *ver, ds_svc_hdl_t hdl)
{
	DR_DBG_CPU("reg_handler: arg=0x%p, ver=%d.%d, hdl=0x%lx\n", arg,
	    ver->major, ver->minor, hdl);

	/* remember the version before publishing the handle */
	version.major = ver->major;
	version.minor = ver->minor;
	ds_handle = hdl;
}
240 
/*
 * DS unregistration callback. Invalidates the saved service handle.
 * The saved version is left untouched.
 */
static void
dr_cpu_unreg_handler(ds_cb_arg_t arg)
{
	DR_DBG_CPU("unreg_handler: arg=0x%p\n", arg);

	ds_handle = DS_INVALID_HDL;
}
248 
249 static void
250 dr_cpu_data_handler(ds_cb_arg_t arg, void *buf, size_t buflen)
251 {
252 	_NOTE(ARGUNUSED(arg))
253 
254 	dr_cpu_hdr_t	*req = buf;
255 	dr_cpu_hdr_t	err_resp;
256 	dr_cpu_hdr_t	*resp = &err_resp;
257 	int		resp_len = 0;
258 	int		rv;
259 
260 	/*
261 	 * Sanity check the message
262 	 */
263 	if (buflen < sizeof (dr_cpu_hdr_t)) {
264 		DR_DBG_CPU("incoming message short: expected at least %ld "
265 		    "bytes, received %ld\n", sizeof (dr_cpu_hdr_t), buflen);
266 		goto done;
267 	}
268 
269 	if (req == NULL) {
270 		DR_DBG_CPU("empty message: expected at least %ld bytes\n",
271 		    sizeof (dr_cpu_hdr_t));
272 		goto done;
273 	}
274 
275 	DR_DBG_CPU("incoming request:\n");
276 	DR_DBG_DUMP_MSG(buf, buflen);
277 
278 	if (req->num_records > NCPU) {
279 		DR_DBG_CPU("CPU list too long: %d when %d is the maximum\n",
280 		    req->num_records, NCPU);
281 		goto done;
282 	}
283 
284 	if (req->num_records == 0) {
285 		DR_DBG_CPU("No CPU specified for operation\n");
286 		goto done;
287 	}
288 
289 	/*
290 	 * Process the command
291 	 */
292 	switch (req->msg_type) {
293 	case DR_CPU_CONFIGURE:
294 	case DR_CPU_UNCONFIGURE:
295 	case DR_CPU_FORCE_UNCONFIG:
296 		if ((rv = dr_cpu_list_wrk(req, &resp, &resp_len)) != 0) {
297 			DR_DBG_CPU("%s%s failed (%d)\n",
298 			    (req->msg_type == DR_CPU_CONFIGURE) ?
299 			    "CPU configure" : "CPU unconfigure",
300 			    (req->msg_type == DR_CPU_FORCE_UNCONFIG) ?
301 			    " (forced)" : "", rv);
302 		}
303 		break;
304 
305 	case DR_CPU_STATUS:
306 		if ((rv = dr_cpu_list_status(req, &resp, &resp_len)) != 0)
307 			DR_DBG_CPU("CPU status failed (%d)\n", rv);
308 		break;
309 
310 	default:
311 		cmn_err(CE_NOTE, "unsupported DR operation (%d)",
312 		    req->msg_type);
313 		break;
314 	}
315 
316 done:
317 	/* check if an error occurred */
318 	if (resp == &err_resp) {
319 		resp->req_num = (req) ? req->req_num : 0;
320 		resp->msg_type = DR_CPU_ERROR;
321 		resp->num_records = 0;
322 		resp_len = sizeof (dr_cpu_hdr_t);
323 	}
324 
325 	DR_DBG_CPU("outgoing response:\n");
326 	DR_DBG_DUMP_MSG(resp, resp_len);
327 
328 	/* send back the response */
329 	if (ds_cap_send(ds_handle, resp, resp_len) != 0) {
330 		DR_DBG_CPU("ds_send failed\n");
331 	}
332 
333 	/* free any allocated memory */
334 	if (DRCPU_VERS_GTEQ(1, 1) || (resp != &err_resp)) {
335 		DR_DBG_KMEM("%s: free addr %p size %d\n",
336 		    __func__, (void *)resp, resp_len);
337 		kmem_free(resp, resp_len);
338 	}
339 }
340 
341 /*
342  * Create a response message which consists of a header followed
343  * by the error string passed in.
344  */
345 static size_t
346 dr_cpu_err_resp(dr_cpu_hdr_t *req, dr_cpu_hdr_t **respp, char *msg)
347 {
348 	size_t size;
349 	dr_cpu_hdr_t *resp;
350 
351 	ASSERT((msg != NULL) && (strlen(msg) > 0));
352 
353 	size = sizeof (*req) + strlen(msg) + 1;
354 	resp = kmem_alloc(size, KM_SLEEP);
355 	DR_DBG_KMEM("%s: alloc addr %p size %ld\n",
356 	    __func__, (void *)resp, size);
357 
358 	resp->req_num = req->req_num;
359 	resp->msg_type = DR_CPU_ERROR;
360 	resp->num_records = 0;
361 
362 	(void) strcpy((char *)(resp) + sizeof (*resp), msg);
363 
364 	*respp = resp;
365 
366 	return (size);
367 }
368 
369 /*
370  * Common routine to config or unconfig multiple cpus.  The unconfig
371  * case checks with the OS to see if the removal of cpus will be
372  * permitted, but can be overridden by the "force" version of the
373  * command.  Otherwise, the logic for both cases is identical.
374  *
375  * Note: Do not modify result buffer or length on error.
376  */
377 static int
378 dr_cpu_list_wrk(dr_cpu_hdr_t *req, dr_cpu_hdr_t **resp, int *resp_len)
379 {
380 	int		rv;
381 	int		idx;
382 	int		count;
383 	fn_t		dr_fn;
384 	int		se_hint;
385 	boolean_t	force = B_FALSE;
386 	uint32_t	*req_cpus;
387 	dr_cpu_res_t	*res;
388 	int		drctl_cmd;
389 	int		drctl_flags = 0;
390 	drctl_rsrc_t	*drctl_req;
391 	size_t		drctl_req_len;
392 	drctl_resp_t	*drctl_resp;
393 	drctl_rsrc_t	*drctl_rsrc;
394 	size_t		drctl_resp_len = 0;
395 	drctl_cookie_t	drctl_res_ck;
396 
397 	ASSERT((req != NULL) && (req->num_records != 0));
398 
399 	count = req->num_records;
400 
401 	/*
402 	 * Extract all information that is specific
403 	 * to the various types of operations.
404 	 */
405 	switch (req->msg_type) {
406 	case DR_CPU_CONFIGURE:
407 		dr_fn = dr_cpu_configure;
408 		drctl_cmd = DRCTL_CPU_CONFIG_REQUEST;
409 		se_hint = SE_HINT_INSERT;
410 		break;
411 	case DR_CPU_FORCE_UNCONFIG:
412 		drctl_flags = DRCTL_FLAG_FORCE;
413 		force = B_TRUE;
414 		_NOTE(FALLTHROUGH)
415 	case DR_CPU_UNCONFIGURE:
416 		dr_fn = dr_cpu_unconfigure;
417 		drctl_cmd = DRCTL_CPU_UNCONFIG_REQUEST;
418 		se_hint = SE_HINT_REMOVE;
419 		break;
420 	default:
421 		/* Programming error if we reach this. */
422 		cmn_err(CE_NOTE,
423 		    "%s: bad msg_type %d\n", __func__, req->msg_type);
424 		ASSERT(0);
425 		return (-1);
426 	}
427 
428 	/* the incoming array of cpuids to operate on */
429 	req_cpus = DR_CPU_CMD_CPUIDS(req);
430 
431 	/* allocate drctl request msg based on incoming resource count */
432 	drctl_req_len = sizeof (drctl_rsrc_t) * count;
433 	drctl_req = kmem_zalloc(drctl_req_len, KM_SLEEP);
434 	DR_DBG_KMEM("%s: alloc addr %p size %ld\n",
435 	    __func__, (void *)drctl_req, drctl_req_len);
436 
437 	/* copy the cpuids for the drctl call from the incoming request msg */
438 	for (idx = 0; idx < count; idx++)
439 		drctl_req[idx].res_cpu_id = req_cpus[idx];
440 
441 	rv = drctl_config_init(drctl_cmd, drctl_flags, drctl_req,
442 	    count, &drctl_resp, &drctl_resp_len, &drctl_res_ck);
443 
444 	ASSERT((drctl_resp != NULL) && (drctl_resp_len != 0));
445 
446 	if (rv != 0) {
447 		DR_DBG_CPU("%s: drctl_config_init "
448 		    "returned: %d\n", __func__, rv);
449 
450 		if (DRCPU_VERS_EQ(1, 0)) {
451 			rv = -1;
452 		} else {
453 			ASSERT(DRCPU_VERS_GTEQ(1, 1));
454 			ASSERT(drctl_resp->resp_type == DRCTL_RESP_ERR);
455 
456 			*resp_len = dr_cpu_err_resp(req,
457 			    resp, drctl_resp->resp_err_msg);
458 		}
459 
460 		DR_DBG_KMEM("%s: free addr %p size %ld\n",
461 		    __func__, (void *)drctl_resp, drctl_resp_len);
462 		kmem_free(drctl_resp, drctl_resp_len);
463 		DR_DBG_KMEM("%s: free addr %p size %ld\n",
464 		    __func__, (void *)drctl_req, drctl_req_len);
465 		kmem_free(drctl_req, drctl_req_len);
466 
467 		return (rv);
468 	}
469 
470 	ASSERT(drctl_resp->resp_type == DRCTL_RESP_OK);
471 
472 	drctl_rsrc = drctl_resp->resp_resources;
473 
474 	/* create the result scratch array */
475 	res = dr_cpu_res_array_init(req, drctl_rsrc, count);
476 
477 	/*
478 	 * For unconfigure, check if there are any conditions
479 	 * that will cause the operation to fail. These are
480 	 * performed before the actual unconfigure attempt so
481 	 * that a meaningful error message can be generated.
482 	 */
483 	if (req->msg_type != DR_CPU_CONFIGURE)
484 		dr_cpu_check_cpus(req, res);
485 
486 	/* perform the specified operation on each of the CPUs */
487 	for (idx = 0; idx < count; idx++) {
488 		int result;
489 		int status;
490 
491 		/*
492 		 * If no action will be taken against the current
493 		 * CPU, update the drctl resource information to
494 		 * ensure that it gets recovered properly during
495 		 * the drctl fini() call.
496 		 */
497 		if (res[idx].result != DR_CPU_RES_OK) {
498 			drctl_req[idx].status = DRCTL_STATUS_CONFIG_FAILURE;
499 			continue;
500 		}
501 
502 		/* call the function to perform the actual operation */
503 		result = (*dr_fn)(req_cpus[idx], &status, force);
504 
505 		/* save off results of the operation */
506 		res[idx].result = result;
507 		res[idx].status = status;
508 
509 		/* save result for drctl fini() reusing init() msg memory */
510 		drctl_req[idx].status = (result != DR_CPU_RES_OK) ?
511 		    DRCTL_STATUS_CONFIG_FAILURE : DRCTL_STATUS_CONFIG_SUCCESS;
512 
513 		DR_DBG_CPU("%s: cpuid %d status %d result %d off '%s'\n",
514 		    __func__, req_cpus[idx], drctl_req[idx].status, result,
515 		    (res[idx].string) ? res[idx].string : "");
516 	}
517 
518 	if ((rv = drctl_config_fini(&drctl_res_ck, drctl_req, count)) != 0)
519 		DR_DBG_CPU("%s: drctl_config_fini "
520 		    "returned: %d\n", __func__, rv);
521 
522 	/*
523 	 * Operation completed without any fatal errors.
524 	 * Pack the response for transmission.
525 	 */
526 	*resp_len = dr_cpu_pack_response(req, res, resp);
527 
528 	/* notify interested parties about the operation */
529 	dr_generate_event(DR_TYPE_CPU, se_hint);
530 
531 	/*
532 	 * Deallocate any scratch memory.
533 	 */
534 	DR_DBG_KMEM("%s: free addr %p size %ld\n",
535 	    __func__, (void *)drctl_resp, drctl_resp_len);
536 	kmem_free(drctl_resp, drctl_resp_len);
537 	DR_DBG_KMEM("%s: free addr %p size %ld\n",
538 	    __func__, (void *)drctl_req, drctl_req_len);
539 	kmem_free(drctl_req, drctl_req_len);
540 
541 	dr_cpu_res_array_fini(res, count);
542 
543 	return (0);
544 }
545 
546 /*
547  * Allocate and initialize a result array based on the initial
548  * drctl operation. A valid result array is always returned.
549  */
550 static dr_cpu_res_t *
551 dr_cpu_res_array_init(dr_cpu_hdr_t *req, drctl_rsrc_t *rsrc, int nrsrc)
552 {
553 	int		idx;
554 	dr_cpu_res_t	*res;
555 	char		*err_str;
556 	size_t		err_len;
557 
558 	/* allocate zero filled buffer to initialize fields */
559 	res = kmem_zalloc(nrsrc * sizeof (dr_cpu_res_t), KM_SLEEP);
560 	DR_DBG_KMEM("%s: alloc addr %p size %ld\n",
561 	    __func__, (void *)res, nrsrc * sizeof (dr_cpu_res_t));
562 
563 	/*
564 	 * Fill in the result information for each resource.
565 	 */
566 	for (idx = 0; idx < nrsrc; idx++) {
567 		res[idx].cpuid = rsrc[idx].res_cpu_id;
568 		res[idx].result = DR_CPU_RES_OK;
569 
570 		if (rsrc[idx].status == DRCTL_STATUS_ALLOW)
571 			continue;
572 
573 		/*
574 		 * Update the state information for this CPU.
575 		 */
576 		res[idx].result = DR_CPU_RES_BLOCKED;
577 		res[idx].status = (req->msg_type == DR_CPU_CONFIGURE) ?
578 		    DR_CPU_STAT_UNCONFIGURED : DR_CPU_STAT_CONFIGURED;
579 
580 		/*
581 		 * If an error string exists, copy it out of the
582 		 * message buffer. This eliminates any dependency
583 		 * on the memory allocated for the message buffer
584 		 * itself.
585 		 */
586 		if (rsrc[idx].offset != NULL) {
587 			err_str = (char *)rsrc + rsrc[idx].offset;
588 			err_len = strlen(err_str) + 1;
589 
590 			res[idx].string = kmem_alloc(err_len, KM_SLEEP);
591 			DR_DBG_KMEM("%s: alloc addr %p size %ld\n",
592 			    __func__, (void *)(res[idx].string), err_len);
593 			bcopy(err_str, res[idx].string, err_len);
594 		}
595 	}
596 
597 	return (res);
598 }
599 
600 static void
601 dr_cpu_res_array_fini(dr_cpu_res_t *res, int nres)
602 {
603 	int	idx;
604 	size_t	str_len;
605 
606 	for (idx = 0; idx < nres; idx++) {
607 		/* deallocate the error string if present */
608 		if (res[idx].string) {
609 			str_len = strlen(res[idx].string) + 1;
610 			DR_DBG_KMEM("%s: free addr %p size %ld\n",
611 			    __func__, (void *)(res[idx].string), str_len);
612 			kmem_free(res[idx].string, str_len);
613 		}
614 	}
615 
616 	/* deallocate the result array itself */
617 	DR_DBG_KMEM("%s: free addr %p size %ld\n",
618 	    __func__, (void *)res, sizeof (dr_cpu_res_t) * nres);
619 	kmem_free(res, sizeof (dr_cpu_res_t) * nres);
620 }
621 
622 /*
623  * Allocate and pack a response message for transmission based
624  * on the specified result array. A valid response message and
625  * valid size information is always returned.
626  */
627 static size_t
628 dr_cpu_pack_response(dr_cpu_hdr_t *req, dr_cpu_res_t *res, dr_cpu_hdr_t **respp)
629 {
630 	int		idx;
631 	dr_cpu_hdr_t	*resp;
632 	dr_cpu_stat_t	*resp_stat;
633 	size_t		resp_len;
634 	uint32_t	curr_off;
635 	caddr_t		curr_str;
636 	size_t		str_len;
637 	size_t		stat_len;
638 	int		nstat = req->num_records;
639 
640 	/*
641 	 * Calculate the size of the response message
642 	 * and allocate an appropriately sized buffer.
643 	 */
644 	resp_len = 0;
645 
646 	/* add the header size */
647 	resp_len += sizeof (dr_cpu_hdr_t);
648 
649 	/* add the stat array size */
650 	stat_len = sizeof (dr_cpu_stat_t) * nstat;
651 	resp_len += stat_len;
652 
653 	/* add the size of any error strings */
654 	for (idx = 0; idx < nstat; idx++) {
655 		if (res[idx].string != NULL) {
656 			resp_len += strlen(res[idx].string) + 1;
657 		}
658 	}
659 
660 	/* allocate the message buffer */
661 	resp = kmem_zalloc(resp_len, KM_SLEEP);
662 	DR_DBG_KMEM("%s: alloc addr %p size %ld\n",
663 	    __func__, (void *)resp, resp_len);
664 
665 	/*
666 	 * Fill in the header information.
667 	 */
668 	resp->req_num = req->req_num;
669 	resp->msg_type = DR_CPU_OK;
670 	resp->num_records = nstat;
671 
672 	/*
673 	 * Fill in the stat information.
674 	 */
675 	resp_stat = DR_CPU_RESP_STATS(resp);
676 
677 	/* string offsets start immediately after stat array */
678 	curr_off = sizeof (dr_cpu_hdr_t) + stat_len;
679 	curr_str = (char *)resp_stat + stat_len;
680 
681 	for (idx = 0; idx < nstat; idx++) {
682 		resp_stat[idx].cpuid = res[idx].cpuid;
683 		resp_stat[idx].result = res[idx].result;
684 		resp_stat[idx].status = res[idx].status;
685 
686 		if (res[idx].string != NULL) {
687 			/* copy over the error string */
688 			str_len = strlen(res[idx].string) + 1;
689 			bcopy(res[idx].string, curr_str, str_len);
690 			resp_stat[idx].string_off = curr_off;
691 
692 			curr_off += str_len;
693 			curr_str += str_len;
694 		}
695 	}
696 
697 	/* buffer should be exactly filled */
698 	ASSERT(curr_off == resp_len);
699 
700 	*respp = resp;
701 	return (resp_len);
702 }
703 
704 /*
705  * Check for conditions that will prevent a CPU from being offlined.
706  * This provides the opportunity to generate useful information to
707  * help diagnose the failure rather than letting the offline attempt
708  * fail in a more generic way.
709  */
710 static void
711 dr_cpu_check_cpus(dr_cpu_hdr_t *req, dr_cpu_res_t *res)
712 {
713 	int		idx;
714 	cpu_t		*cp;
715 	uint32_t	*cpuids;
716 
717 	ASSERT((req->msg_type == DR_CPU_UNCONFIGURE) ||
718 	    (req->msg_type == DR_CPU_FORCE_UNCONFIG));
719 
720 	DR_DBG_CPU("dr_cpu_check_cpus...\n");
721 
722 	/* array of cpuids start just after the header */
723 	cpuids = DR_CPU_CMD_CPUIDS(req);
724 
725 	mutex_enter(&cpu_lock);
726 
727 	/*
728 	 * Always check processor set membership first. The
729 	 * last CPU in a processor set will fail to offline
730 	 * even if the operation if forced, so any failures
731 	 * should always be reported.
732 	 */
733 	dr_cpu_check_psrset(cpuids, res, req->num_records);
734 
735 	/* process each cpu that is part of the request */
736 	for (idx = 0; idx < req->num_records; idx++) {
737 
738 		/* nothing to check if the CPU has already failed */
739 		if (res[idx].result != DR_CPU_RES_OK)
740 			continue;
741 
742 		if ((cp = cpu_get(cpuids[idx])) == NULL)
743 			continue;
744 
745 		/*
746 		 * Only check if there are bound threads if the
747 		 * operation is not a forced unconfigure. In a
748 		 * forced request, threads are automatically
749 		 * unbound before they are offlined.
750 		 */
751 		if (req->msg_type == DR_CPU_UNCONFIGURE) {
752 			/*
753 			 * The return value is only interesting if other
754 			 * checks are added to this loop and a decision
755 			 * is needed on whether to continue checking.
756 			 */
757 			(void) dr_cpu_check_bound_thr(cp, &res[idx]);
758 		}
759 	}
760 
761 	mutex_exit(&cpu_lock);
762 }
763 
764 /*
765  * Examine the processor set configuration for the specified
766  * CPUs and see if the unconfigure operation would result in
767  * trying to remove the last CPU in any processor set.
768  */
769 static void
770 dr_cpu_check_psrset(uint32_t *cpuids, dr_cpu_res_t *res, int nres)
771 {
772 	int		cpu_idx;
773 	int		set_idx;
774 	cpu_t		*cp;
775 	cpupart_t	*cpp;
776 	char		err_str[DR_CPU_MAX_ERR_LEN];
777 	size_t		err_len;
778 	struct {
779 		cpupart_t	*cpp;
780 		int		ncpus;
781 	} *psrset;
782 
783 	ASSERT(MUTEX_HELD(&cpu_lock));
784 
785 	/*
786 	 * Allocate a scratch array to count the CPUs in
787 	 * the various processor sets. A CPU always belongs
788 	 * to exactly one processor set, so by definition,
789 	 * the scratch array never needs to be larger than
790 	 * the number of CPUs.
791 	 */
792 	psrset = kmem_zalloc(sizeof (*psrset) * nres, KM_SLEEP);
793 	DR_DBG_KMEM("%s: alloc addr %p size %ld\n",
794 	    __func__, (void *)psrset, sizeof (*psrset) * nres);
795 
796 	for (cpu_idx = 0; cpu_idx < nres; cpu_idx++) {
797 
798 		/* skip any CPUs that have already failed */
799 		if (res[cpu_idx].result != DR_CPU_RES_OK)
800 			continue;
801 
802 		if ((cp = cpu_get(cpuids[cpu_idx])) == NULL)
803 			continue;
804 
805 		cpp = cp->cpu_part;
806 
807 		/* lookup the set this CPU belongs to */
808 		for (set_idx = 0; set_idx < nres; set_idx++) {
809 
810 			/* matching set found */
811 			if (cpp == psrset[set_idx].cpp)
812 				break;
813 
814 			/* set not found, start a new entry */
815 			if (psrset[set_idx].cpp == NULL) {
816 				psrset[set_idx].cpp = cpp;
817 				psrset[set_idx].ncpus = cpp->cp_ncpus;
818 				break;
819 			}
820 		}
821 
822 		ASSERT(set_idx != nres);
823 
824 		/*
825 		 * Remove the current CPU from the set total but only
826 		 * generate an error for the last CPU. The correct CPU
827 		 * will get the error because the unconfigure attempts
828 		 * will occur in the same order in which the CPUs are
829 		 * examined in this loop.  The cp_ncpus field of a
830 		 * cpupart_t counts only online cpus, so it is safe
831 		 * to remove an offline cpu without testing ncpus.
832 		 */
833 		if (cpu_is_offline(cp))
834 			continue;
835 
836 		if (--psrset[set_idx].ncpus == 0) {
837 			/*
838 			 * Fill in the various pieces of information
839 			 * to report that the operation will fail.
840 			 */
841 			res[cpu_idx].result = DR_CPU_RES_BLOCKED;
842 			res[cpu_idx].status = DR_CPU_STAT_CONFIGURED;
843 
844 			(void) snprintf(err_str, DR_CPU_MAX_ERR_LEN,
845 			    "last online cpu in processor set %d", cpp->cp_id);
846 
847 			err_len = strlen(err_str) + 1;
848 
849 			res[cpu_idx].string = kmem_alloc(err_len, KM_SLEEP);
850 			DR_DBG_KMEM("%s: alloc addr %p size %ld\n",
851 			    __func__, (void *)(res[cpu_idx].string), err_len);
852 			bcopy(err_str, res[cpu_idx].string, err_len);
853 
854 			DR_DBG_CPU("cpu %d: %s\n", cpuids[cpu_idx], err_str);
855 		}
856 	}
857 
858 	DR_DBG_KMEM("%s: free addr %p size %ld\n",
859 	    __func__, (void *)psrset, sizeof (*psrset) * nres);
860 	kmem_free(psrset, sizeof (*psrset) * nres);
861 }
862 
863 /*
864  * Check if any threads are bound to the specified CPU. If the
865  * condition is true, DR_CPU_RES_BLOCKED is returned and an error
866  * string is generated and placed in the specified result structure.
867  * Otherwise, DR_CPU_RES_OK is returned.
868  */
869 static int
870 dr_cpu_check_bound_thr(cpu_t *cp, dr_cpu_res_t *res)
871 {
872 	int		nbound;
873 	proc_t		*pp;
874 	kthread_t	*tp;
875 	char		err_str[DR_CPU_MAX_ERR_LEN];
876 	size_t		err_len;
877 
878 	/*
879 	 * Error string allocation makes an assumption
880 	 * that no blocking condition has been identified.
881 	 */
882 	ASSERT(res->result == DR_CPU_RES_OK);
883 	ASSERT(res->string == NULL);
884 
885 	ASSERT(MUTEX_HELD(&cpu_lock));
886 
887 	mutex_enter(&pidlock);
888 
889 	nbound = 0;
890 
891 	/*
892 	 * Walk the active processes, checking if each
893 	 * thread belonging to the process is bound.
894 	 */
895 	for (pp = practive; (pp != NULL) && (nbound <= 1); pp = pp->p_next) {
896 		mutex_enter(&pp->p_lock);
897 
898 		tp = pp->p_tlist;
899 
900 		if ((tp == NULL) || (pp->p_flag & SSYS)) {
901 			mutex_exit(&pp->p_lock);
902 			continue;
903 		}
904 
905 		do {
906 			if (tp->t_bind_cpu != cp->cpu_id)
907 				continue;
908 
909 			/*
910 			 * Update the running total of bound
911 			 * threads. Continue the search until
912 			 * it can be determined if more than
913 			 * one thread is bound to the CPU.
914 			 */
915 			if (++nbound > 1)
916 				break;
917 
918 		} while ((tp = tp->t_forw) != pp->p_tlist);
919 
920 		mutex_exit(&pp->p_lock);
921 	}
922 
923 	mutex_exit(&pidlock);
924 
925 	if (nbound) {
926 		/*
927 		 * Threads are bound to the CPU. Fill in
928 		 * various pieces of information to report
929 		 * that the operation will fail.
930 		 */
931 		res->result = DR_CPU_RES_BLOCKED;
932 		res->status = DR_CPU_STAT_CONFIGURED;
933 
934 		(void) snprintf(err_str, DR_CPU_MAX_ERR_LEN, "cpu has bound "
935 		    "thread%s", (nbound > 1) ? "s" : "");
936 
937 		err_len = strlen(err_str) + 1;
938 
939 		res->string = kmem_alloc(err_len, KM_SLEEP);
940 		DR_DBG_KMEM("%s: alloc addr %p size %ld\n",
941 		    __func__, (void *)(res->string), err_len);
942 		bcopy(err_str, res->string, err_len);
943 
944 		DR_DBG_CPU("cpu %d: %s\n", cp->cpu_id, err_str);
945 	}
946 
947 	return (res->result);
948 }
949 
950 /*
951  * Do not modify result buffer or length on error.
952  */
953 static int
954 dr_cpu_list_status(dr_cpu_hdr_t *req, dr_cpu_hdr_t **resp, int *resp_len)
955 {
956 	int		idx;
957 	int		result;
958 	int		status;
959 	int		rlen;
960 	uint32_t	*cpuids;
961 	dr_cpu_hdr_t	*rp;
962 	dr_cpu_stat_t	*stat;
963 	md_t		*mdp = NULL;
964 	int		num_nodes;
965 	int		listsz;
966 	mde_cookie_t	*listp = NULL;
967 	mde_cookie_t	cpunode;
968 	boolean_t	walk_md = B_FALSE;
969 
970 	/* the incoming array of cpuids to configure */
971 	cpuids = DR_CPU_CMD_CPUIDS(req);
972 
973 	/* allocate a response message */
974 	rlen = sizeof (dr_cpu_hdr_t);
975 	rlen += req->num_records * sizeof (dr_cpu_stat_t);
976 	rp = kmem_zalloc(rlen, KM_SLEEP);
977 	DR_DBG_KMEM("%s: alloc addr %p size %d\n", __func__, (void *)rp, rlen);
978 
979 	/* fill in the known data */
980 	rp->req_num = req->req_num;
981 	rp->msg_type = DR_CPU_STATUS;
982 	rp->num_records = req->num_records;
983 
984 	/* stat array for the response */
985 	stat = DR_CPU_RESP_STATS(rp);
986 
987 	/* get the status for each of the CPUs */
988 	for (idx = 0; idx < req->num_records; idx++) {
989 
990 		result = dr_cpu_status(cpuids[idx], &status);
991 
992 		if (result == DR_CPU_RES_FAILURE)
993 			walk_md = B_TRUE;
994 
995 		/* save off results of the status */
996 		stat[idx].cpuid = cpuids[idx];
997 		stat[idx].result = result;
998 		stat[idx].status = status;
999 	}
1000 
1001 	if (walk_md == B_FALSE)
1002 		goto done;
1003 
1004 	/*
1005 	 * At least one of the cpus did not have a CPU
1006 	 * structure. So, consult the MD to determine if
1007 	 * they are present.
1008 	 */
1009 
1010 	if ((mdp = md_get_handle()) == NULL) {
1011 		DR_DBG_CPU("unable to initialize MD\n");
1012 		goto done;
1013 	}
1014 
1015 	num_nodes = md_node_count(mdp);
1016 	ASSERT(num_nodes > 0);
1017 
1018 	listsz = num_nodes * sizeof (mde_cookie_t);
1019 	listp = kmem_zalloc(listsz, KM_SLEEP);
1020 	DR_DBG_KMEM("%s: alloc addr %p size %d\n",
1021 	    __func__, (void *)listp, listsz);
1022 
1023 	for (idx = 0; idx < req->num_records; idx++) {
1024 
1025 		if (stat[idx].result != DR_CPU_RES_FAILURE)
1026 			continue;
1027 
1028 		/* check the MD for the current cpuid */
1029 		cpunode = dr_cpu_find_node_md(stat[idx].cpuid, mdp, listp);
1030 
1031 		stat[idx].result = DR_CPU_RES_OK;
1032 
1033 		if (cpunode == MDE_INVAL_ELEM_COOKIE) {
1034 			stat[idx].status = DR_CPU_STAT_NOT_PRESENT;
1035 		} else {
1036 			stat[idx].status = DR_CPU_STAT_UNCONFIGURED;
1037 		}
1038 	}
1039 
1040 	DR_DBG_KMEM("%s: free addr %p size %d\n",
1041 	    __func__, (void *)listp, listsz);
1042 	kmem_free(listp, listsz);
1043 
1044 	(void) md_fini_handle(mdp);
1045 
1046 done:
1047 	*resp = rp;
1048 	*resp_len = rlen;
1049 
1050 	return (0);
1051 }
1052 
1053 static int
1054 dr_cpu_configure(processorid_t cpuid, int *status, boolean_t force)
1055 {
1056 	 _NOTE(ARGUNUSED(force))
1057 	struct cpu	*cp;
1058 	int		rv = 0;
1059 
1060 	DR_DBG_CPU("dr_cpu_configure...\n");
1061 
1062 	/*
1063 	 * Build device tree node for the CPU
1064 	 */
1065 	if ((rv = dr_cpu_probe(cpuid)) != 0) {
1066 		DR_DBG_CPU("failed to probe CPU %d (%d)\n", cpuid, rv);
1067 		if (rv == EINVAL) {
1068 			*status = DR_CPU_STAT_NOT_PRESENT;
1069 			return (DR_CPU_RES_NOT_IN_MD);
1070 		}
1071 		*status = DR_CPU_STAT_UNCONFIGURED;
1072 		return (DR_CPU_RES_FAILURE);
1073 	}
1074 
1075 	mutex_enter(&cpu_lock);
1076 
1077 	/*
1078 	 * Configure the CPU
1079 	 */
1080 	if ((cp = cpu_get(cpuid)) == NULL) {
1081 
1082 		if ((rv = cpu_configure(cpuid)) != 0) {
1083 			DR_DBG_CPU("failed to configure CPU %d (%d)\n",
1084 			    cpuid, rv);
1085 			rv = DR_CPU_RES_FAILURE;
1086 			*status = DR_CPU_STAT_UNCONFIGURED;
1087 			goto done;
1088 		}
1089 
1090 		DR_DBG_CPU("CPU %d configured\n", cpuid);
1091 
1092 		/* CPU struct should exist now */
1093 		cp = cpu_get(cpuid);
1094 	}
1095 
1096 	ASSERT(cp);
1097 
1098 	/*
1099 	 * Power on the CPU. In sun4v, this brings the stopped
1100 	 * CPU into the guest from the Hypervisor.
1101 	 */
1102 	if (cpu_is_poweredoff(cp)) {
1103 
1104 		if ((rv = cpu_poweron(cp)) != 0) {
1105 			DR_DBG_CPU("failed to power on CPU %d (%d)\n",
1106 			    cpuid, rv);
1107 			rv = DR_CPU_RES_FAILURE;
1108 			*status = DR_CPU_STAT_UNCONFIGURED;
1109 			goto done;
1110 		}
1111 
1112 		DR_DBG_CPU("CPU %d powered on\n", cpuid);
1113 	}
1114 
1115 	/*
1116 	 * Online the CPU
1117 	 */
1118 	if (cpu_is_offline(cp)) {
1119 
1120 		if ((rv = cpu_online(cp, 0)) != 0) {
1121 			DR_DBG_CPU("failed to online CPU %d (%d)\n",
1122 			    cpuid, rv);
1123 			rv = DR_CPU_RES_FAILURE;
1124 			/* offline is still configured */
1125 			*status = DR_CPU_STAT_CONFIGURED;
1126 			goto done;
1127 		}
1128 
1129 		DR_DBG_CPU("CPU %d online\n", cpuid);
1130 	}
1131 
1132 	rv = DR_CPU_RES_OK;
1133 	*status = DR_CPU_STAT_CONFIGURED;
1134 
1135 done:
1136 	mutex_exit(&cpu_lock);
1137 
1138 	return (rv);
1139 }
1140 
1141 static int
1142 dr_cpu_unconfigure(processorid_t cpuid, int *status, boolean_t force)
1143 {
1144 	struct cpu	*cp;
1145 	int		rv = 0;
1146 	int		cpu_flags;
1147 
1148 	DR_DBG_CPU("dr_cpu_unconfigure%s...\n", (force) ? " (force)" : "");
1149 
1150 	mutex_enter(&cpu_lock);
1151 
1152 	cp = cpu_get(cpuid);
1153 
1154 	if (cp == NULL) {
1155 		/*
1156 		 * As OS CPU structures are already torn down proceed
1157 		 * to deprobe device tree to make sure the device tree
1158 		 * is up do date.
1159 		 */
1160 		goto deprobe;
1161 	}
1162 
1163 	ASSERT(cp->cpu_id == cpuid);
1164 
1165 	/*
1166 	 * Offline the CPU
1167 	 */
1168 	if (cpu_is_active(cp)) {
1169 
1170 		/* set the force flag correctly */
1171 		cpu_flags = (force) ? CPU_FORCED : 0;
1172 
1173 		/*
1174 		 * Before we take the CPU offline, we first enable interrupts.
1175 		 * Otherwise, cpu_offline() might reject the request.  Note:
1176 		 * if the offline subsequently fails, the target cpu will be
1177 		 * left with interrupts enabled.  This is consistent with the
1178 		 * behavior of psradm(1M) and p_online(2).
1179 		 */
1180 		cpu_intr_enable(cp);
1181 
1182 		if ((rv = cpu_offline(cp, cpu_flags)) != 0) {
1183 			DR_DBG_CPU("failed to offline CPU %d (%d)\n",
1184 			    cpuid, rv);
1185 
1186 			rv = DR_CPU_RES_FAILURE;
1187 			*status = DR_CPU_STAT_CONFIGURED;
1188 			mutex_exit(&cpu_lock);
1189 			return (rv);
1190 		}
1191 
1192 		DR_DBG_CPU("CPU %d offline\n", cpuid);
1193 	}
1194 
1195 	/*
1196 	 * Power off the CPU. In sun4v, this puts the running
1197 	 * CPU into the stopped state in the Hypervisor.
1198 	 */
1199 	if (!cpu_is_poweredoff(cp)) {
1200 
1201 		if ((rv = cpu_poweroff(cp)) != 0) {
1202 			DR_DBG_CPU("failed to power off CPU %d (%d)\n",
1203 			    cpuid, rv);
1204 			rv = DR_CPU_RES_FAILURE;
1205 			*status = DR_CPU_STAT_CONFIGURED;
1206 			mutex_exit(&cpu_lock);
1207 			return (rv);
1208 		}
1209 
1210 		DR_DBG_CPU("CPU %d powered off\n", cpuid);
1211 	}
1212 
1213 	/*
1214 	 * Unconfigure the CPU
1215 	 */
1216 	if ((rv = cpu_unconfigure(cpuid)) != 0) {
1217 		DR_DBG_CPU("failed to unconfigure CPU %d (%d)\n", cpuid, rv);
1218 		rv = DR_CPU_RES_FAILURE;
1219 		*status = DR_CPU_STAT_UNCONFIGURED;
1220 		mutex_exit(&cpu_lock);
1221 		return (rv);
1222 	}
1223 
1224 	DR_DBG_CPU("CPU %d unconfigured\n", cpuid);
1225 
1226 deprobe:
1227 	mutex_exit(&cpu_lock);
1228 	/*
1229 	 * Tear down device tree.
1230 	 */
1231 	if ((rv = dr_cpu_deprobe(cpuid)) != 0) {
1232 		DR_DBG_CPU("failed to deprobe CPU %d (%d)\n", cpuid, rv);
1233 		rv = DR_CPU_RES_FAILURE;
1234 		*status = DR_CPU_STAT_UNCONFIGURED;
1235 		return (rv);
1236 	}
1237 
1238 	rv = DR_CPU_RES_OK;
1239 	*status = DR_CPU_STAT_UNCONFIGURED;
1240 
1241 	return (rv);
1242 }
1243 
1244 /*
1245  * Determine the state of a CPU. If the CPU structure is not present,
1246  * it does not attempt to determine whether or not the CPU is in the
1247  * MD. It is more efficient to do this at the higher level for all
1248  * CPUs since it may not even be necessary to search the MD if all
1249  * the CPUs are accounted for. Returns DR_CPU_RES_OK if the CPU
1250  * structure is present, and DR_CPU_RES_FAILURE otherwise as a signal
1251  * that an MD walk is necessary.
1252  */
1253 static int
1254 dr_cpu_status(processorid_t cpuid, int *status)
1255 {
1256 	int		rv;
1257 	struct cpu	*cp;
1258 
1259 	DR_DBG_CPU("dr_cpu_status...\n");
1260 
1261 	mutex_enter(&cpu_lock);
1262 
1263 	if ((cp = cpu_get(cpuid)) == NULL) {
1264 		/* need to check if cpu is in the MD */
1265 		rv = DR_CPU_RES_FAILURE;
1266 		goto done;
1267 	}
1268 
1269 	if (cpu_is_poweredoff(cp)) {
1270 		/*
1271 		 * The CPU is powered off, so it is considered
1272 		 * unconfigured from the service entity point of
1273 		 * view. The CPU is not available to the system
1274 		 * and intervention by the service entity would
1275 		 * be required to change that.
1276 		 */
1277 		*status = DR_CPU_STAT_UNCONFIGURED;
1278 	} else {
1279 		/*
1280 		 * The CPU is powered on, so it is considered
1281 		 * configured from the service entity point of
1282 		 * view. It is available for use by the system
1283 		 * and service entities are not concerned about
1284 		 * the operational status (offline, online, etc.)
1285 		 * of the CPU in terms of DR.
1286 		 */
1287 		*status = DR_CPU_STAT_CONFIGURED;
1288 	}
1289 
1290 	rv = DR_CPU_RES_OK;
1291 
1292 done:
1293 	mutex_exit(&cpu_lock);
1294 
1295 	return (rv);
1296 }
1297 
1298 typedef struct {
1299 	md_t		*mdp;
1300 	mde_cookie_t	cpunode;
1301 	dev_info_t	*dip;
1302 } cb_arg_t;
1303 
1304 #define	STR_ARR_LEN	5
1305 
1306 static int
1307 new_cpu_node(dev_info_t *new_node, void *arg, uint_t flags)
1308 {
1309 	_NOTE(ARGUNUSED(flags))
1310 
1311 	char		*compat;
1312 	uint64_t	freq;
1313 	uint64_t	cpuid = 0;
1314 	int		regbuf[4];
1315 	int		len = 0;
1316 	cb_arg_t	*cba;
1317 	char		*str_arr[STR_ARR_LEN];
1318 	char		*curr;
1319 	int		idx = 0;
1320 
1321 	DR_DBG_CPU("new_cpu_node...\n");
1322 
1323 	cba = (cb_arg_t *)arg;
1324 
1325 	/*
1326 	 * Add 'name' property
1327 	 */
1328 	if (ndi_prop_update_string(DDI_DEV_T_NONE, new_node,
1329 	    "name", "cpu") != DDI_SUCCESS) {
1330 		DR_DBG_CPU("new_cpu_node: failed to create 'name' property\n");
1331 		return (DDI_WALK_ERROR);
1332 	}
1333 
1334 	/*
1335 	 * Add 'compatible' property
1336 	 */
1337 	if (md_get_prop_data(cba->mdp, cba->cpunode, "compatible",
1338 	    (uint8_t **)(&compat), &len)) {
1339 		DR_DBG_CPU("new_cpu_node: failed to read 'compatible' property "
1340 		    "from MD\n");
1341 		return (DDI_WALK_ERROR);
1342 	}
1343 
1344 	DR_DBG_CPU("'compatible' len is %d\n", len);
1345 
1346 	/* parse the MD string array */
1347 	curr = compat;
1348 	while (curr < (compat + len)) {
1349 
1350 		DR_DBG_CPU("adding '%s' to 'compatible' property\n", curr);
1351 
1352 		str_arr[idx++] = curr;
1353 		curr += strlen(curr) + 1;
1354 
1355 		if (idx == STR_ARR_LEN) {
1356 			DR_DBG_CPU("exceeded str_arr len (%d)\n", STR_ARR_LEN);
1357 			break;
1358 		}
1359 	}
1360 
1361 	if (ndi_prop_update_string_array(DDI_DEV_T_NONE, new_node,
1362 	    "compatible", str_arr, idx) != DDI_SUCCESS) {
1363 		DR_DBG_CPU("new_cpu_node: failed to create 'compatible' "
1364 		    "property\n");
1365 		return (DDI_WALK_ERROR);
1366 	}
1367 
1368 	/*
1369 	 * Add 'device_type' property
1370 	 */
1371 	if (ndi_prop_update_string(DDI_DEV_T_NONE, new_node,
1372 	    "device_type", "cpu") != DDI_SUCCESS) {
1373 		DR_DBG_CPU("new_cpu_node: failed to create 'device_type' "
1374 		    "property\n");
1375 		return (DDI_WALK_ERROR);
1376 	}
1377 
1378 	/*
1379 	 * Add 'clock-frequency' property
1380 	 */
1381 	if (md_get_prop_val(cba->mdp, cba->cpunode, "clock-frequency", &freq)) {
1382 		DR_DBG_CPU("new_cpu_node: failed to read 'clock-frequency' "
1383 		    "property from MD\n");
1384 		return (DDI_WALK_ERROR);
1385 	}
1386 
1387 	if (ndi_prop_update_int(DDI_DEV_T_NONE, new_node,
1388 	    "clock-frequency", freq) != DDI_SUCCESS) {
1389 		DR_DBG_CPU("new_cpu_node: failed to create 'clock-frequency' "
1390 		    "property\n");
1391 		return (DDI_WALK_ERROR);
1392 	}
1393 
1394 	/*
1395 	 * Add 'reg' (cpuid) property
1396 	 */
1397 	if (md_get_prop_val(cba->mdp, cba->cpunode, "id", &cpuid)) {
1398 		DR_DBG_CPU("new_cpu_node: failed to read 'id' property "
1399 		    "from MD\n");
1400 		return (DDI_WALK_ERROR);
1401 	}
1402 
1403 	DR_DBG_CPU("new cpuid=0x%lx\n", cpuid);
1404 
1405 	bzero(regbuf, 4 * sizeof (int));
1406 	regbuf[0] = 0xc0000000 | cpuid;
1407 
1408 	if (ndi_prop_update_int_array(DDI_DEV_T_NONE, new_node,
1409 	    "reg", regbuf, 4) != DDI_SUCCESS) {
1410 		DR_DBG_CPU("new_cpu_node: failed to create 'reg' property\n");
1411 		return (DDI_WALK_ERROR);
1412 	}
1413 
1414 	cba->dip = new_node;
1415 
1416 	return (DDI_WALK_TERMINATE);
1417 }
1418 
1419 static int
1420 dr_cpu_probe(processorid_t cpuid)
1421 {
1422 	dev_info_t	*pdip;
1423 	dev_info_t	*dip;
1424 	devi_branch_t	br;
1425 	md_t		*mdp = NULL;
1426 	int		num_nodes;
1427 	int		rv = 0;
1428 	int		listsz;
1429 	mde_cookie_t	*listp = NULL;
1430 	cb_arg_t	cba;
1431 	mde_cookie_t	cpunode;
1432 
1433 	if ((dip = dr_cpu_find_node(cpuid)) != NULL) {
1434 		/* nothing to do */
1435 		e_ddi_branch_rele(dip);
1436 		return (0);
1437 	}
1438 
1439 	if ((mdp = md_get_handle()) == NULL) {
1440 		DR_DBG_CPU("unable to initialize machine description\n");
1441 		return (-1);
1442 	}
1443 
1444 	num_nodes = md_node_count(mdp);
1445 	ASSERT(num_nodes > 0);
1446 
1447 	listsz = num_nodes * sizeof (mde_cookie_t);
1448 	listp = kmem_zalloc(listsz, KM_SLEEP);
1449 	DR_DBG_KMEM("%s: alloc addr %p size %d\n",
1450 	    __func__, (void *)listp, listsz);
1451 
1452 	cpunode = dr_cpu_find_node_md(cpuid, mdp, listp);
1453 
1454 	if (cpunode == MDE_INVAL_ELEM_COOKIE) {
1455 		rv = EINVAL;
1456 		goto done;
1457 	}
1458 
1459 	/* pass in MD cookie for CPU */
1460 	cba.mdp = mdp;
1461 	cba.cpunode = cpunode;
1462 
1463 	br.arg = (void *)&cba;
1464 	br.type = DEVI_BRANCH_SID;
1465 	br.create.sid_branch_create = new_cpu_node;
1466 	br.devi_branch_callback = NULL;
1467 	pdip = ddi_root_node();
1468 
1469 	if ((rv = e_ddi_branch_create(pdip, &br, NULL, 0))) {
1470 		DR_DBG_CPU("e_ddi_branch_create failed: %d\n", rv);
1471 		rv = -1;
1472 		goto done;
1473 	}
1474 
1475 	DR_DBG_CPU("CPU %d probed\n", cpuid);
1476 
1477 	rv = 0;
1478 
1479 done:
1480 	if (listp) {
1481 		DR_DBG_KMEM("%s: free addr %p size %d\n",
1482 		    __func__, (void *)listp, listsz);
1483 		kmem_free(listp, listsz);
1484 	}
1485 
1486 	if (mdp)
1487 		(void) md_fini_handle(mdp);
1488 
1489 	return (rv);
1490 }
1491 
1492 static int
1493 dr_cpu_deprobe(processorid_t cpuid)
1494 {
1495 	dev_info_t	*fdip = NULL;
1496 	dev_info_t	*dip;
1497 
1498 	if ((dip = dr_cpu_find_node(cpuid)) == NULL) {
1499 		DR_DBG_CPU("cpuid %d already deprobed\n", cpuid);
1500 		return (0);
1501 	}
1502 
1503 	ASSERT(e_ddi_branch_held(dip));
1504 
1505 	if (e_ddi_branch_destroy(dip, &fdip, 0)) {
1506 		char *path = kmem_alloc(MAXPATHLEN, KM_SLEEP);
1507 
1508 		DR_DBG_KMEM("%s: alloc addr %p size %d\n",
1509 		    __func__, (void *)path, MAXPATHLEN);
1510 		/*
1511 		 * If non-NULL, fdip is held and must be released.
1512 		 */
1513 		if (fdip != NULL) {
1514 			(void) ddi_pathname(fdip, path);
1515 			ddi_release_devi(fdip);
1516 		} else {
1517 			(void) ddi_pathname(dip, path);
1518 		}
1519 		cmn_err(CE_NOTE, "node removal failed: %s (%p)",
1520 		    path, (fdip) ? (void *)fdip : (void *)dip);
1521 
1522 		DR_DBG_KMEM("%s: free addr %p size %d\n",
1523 		    __func__, (void *)path, MAXPATHLEN);
1524 		kmem_free(path, MAXPATHLEN);
1525 
1526 		return (-1);
1527 	}
1528 
1529 	DR_DBG_CPU("CPU %d deprobed\n", cpuid);
1530 
1531 	return (0);
1532 }
1533 
1534 typedef struct {
1535 	processorid_t	cpuid;
1536 	dev_info_t	*dip;
1537 } dr_search_arg_t;
1538 
1539 static int
1540 dr_cpu_check_node(dev_info_t *dip, void *arg)
1541 {
1542 	char		*name;
1543 	processorid_t	cpuid;
1544 	dr_search_arg_t	*sarg = (dr_search_arg_t *)arg;
1545 
1546 	if (dip == ddi_root_node()) {
1547 		return (DDI_WALK_CONTINUE);
1548 	}
1549 
1550 	name = ddi_node_name(dip);
1551 
1552 	if (strcmp(name, "cpu") != 0) {
1553 		return (DDI_WALK_PRUNECHILD);
1554 	}
1555 
1556 	cpuid = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
1557 	    "reg", -1);
1558 
1559 	cpuid = PROM_CFGHDL_TO_CPUID(cpuid);
1560 
1561 	DR_DBG_CPU("found cpuid=0x%x, looking for 0x%x\n", cpuid, sarg->cpuid);
1562 
1563 	if (cpuid == sarg->cpuid) {
1564 		DR_DBG_CPU("matching node\n");
1565 
1566 		/* matching node must be returned held */
1567 		if (!e_ddi_branch_held(dip))
1568 			e_ddi_branch_hold(dip);
1569 
1570 		sarg->dip = dip;
1571 		return (DDI_WALK_TERMINATE);
1572 	}
1573 
1574 	return (DDI_WALK_CONTINUE);
1575 }
1576 
1577 /*
1578  * Walk the device tree to find the dip corresponding to the cpuid
1579  * passed in. If present, the dip is returned held. The caller must
1580  * release the hold on the dip once it is no longer required. If no
1581  * matching node if found, NULL is returned.
1582  */
1583 static dev_info_t *
1584 dr_cpu_find_node(processorid_t cpuid)
1585 {
1586 	dr_search_arg_t	arg;
1587 
1588 	DR_DBG_CPU("dr_cpu_find_node...\n");
1589 
1590 	arg.cpuid = cpuid;
1591 	arg.dip = NULL;
1592 
1593 	ddi_walk_devs(ddi_root_node(), dr_cpu_check_node, &arg);
1594 
1595 	ASSERT((arg.dip == NULL) || (e_ddi_branch_held(arg.dip)));
1596 
1597 	return ((arg.dip) ? arg.dip : NULL);
1598 }
1599 
1600 /*
1601  * Look up a particular cpuid in the MD. Returns the mde_cookie_t
1602  * representing that CPU if present, and MDE_INVAL_ELEM_COOKIE
1603  * otherwise. It is assumed the scratch array has already been
1604  * allocated so that it can accommodate the worst case scenario,
1605  * every node in the MD.
1606  */
1607 static mde_cookie_t
1608 dr_cpu_find_node_md(processorid_t cpuid, md_t *mdp, mde_cookie_t *listp)
1609 {
1610 	int		idx;
1611 	int		nnodes;
1612 	mde_cookie_t	rootnode;
1613 	uint64_t	cpuid_prop;
1614 	mde_cookie_t	result = MDE_INVAL_ELEM_COOKIE;
1615 
1616 	rootnode = md_root_node(mdp);
1617 	ASSERT(rootnode != MDE_INVAL_ELEM_COOKIE);
1618 
1619 	/*
1620 	 * Scan the DAG for all the CPU nodes
1621 	 */
1622 	nnodes = md_scan_dag(mdp, rootnode, md_find_name(mdp, "cpu"),
1623 	    md_find_name(mdp, "fwd"), listp);
1624 
1625 	if (nnodes < 0) {
1626 		DR_DBG_CPU("Scan for CPUs failed\n");
1627 		return (result);
1628 	}
1629 
1630 	DR_DBG_CPU("dr_cpu_find_node_md: found %d CPUs in the MD\n", nnodes);
1631 
1632 	/*
1633 	 * Find the CPU of interest
1634 	 */
1635 	for (idx = 0; idx < nnodes; idx++) {
1636 
1637 		if (md_get_prop_val(mdp, listp[idx], "id", &cpuid_prop)) {
1638 			DR_DBG_CPU("Missing 'id' property for CPU node %d\n",
1639 			    idx);
1640 			break;
1641 		}
1642 
1643 		if (cpuid_prop == cpuid) {
1644 			/* found a match */
1645 			DR_DBG_CPU("dr_cpu_find_node_md: found CPU %d "
1646 			    "in MD\n", cpuid);
1647 			result = listp[idx];
1648 			break;
1649 		}
1650 	}
1651 
1652 	if (result == MDE_INVAL_ELEM_COOKIE) {
1653 		DR_DBG_CPU("CPU %d not in MD\n", cpuid);
1654 	}
1655 
1656 	return (result);
1657 }
1658