xref: /illumos-gate/usr/src/uts/common/disp/cmt.c (revision df3cd224ef765c29101e4110546062199562f757)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/systm.h>
27 #include <sys/types.h>
28 #include <sys/param.h>
29 #include <sys/thread.h>
30 #include <sys/cpuvar.h>
31 #include <sys/cpupart.h>
32 #include <sys/kmem.h>
33 #include <sys/cmn_err.h>
34 #include <sys/kstat.h>
35 #include <sys/processor.h>
36 #include <sys/disp.h>
37 #include <sys/group.h>
38 #include <sys/pghw.h>
39 #include <sys/bitset.h>
40 #include <sys/lgrp.h>
41 #include <sys/cmt.h>
42 #include <sys/cpu_pm.h>
43 
44 /*
45  * CMT scheduler / dispatcher support
46  *
47  * This file implements CMT scheduler support using Processor Groups.
48  * The CMT processor group class creates and maintains the CMT class
49  * specific processor group pg_cmt_t.
50  *
51  * ---------------------------- <-- pg_cmt_t *
52  * | pghw_t                   |
53  * ----------------------------
54  * | CMT class specific data  |
55  * | - hierarchy linkage      |
56  * | - CMT load balancing data|
57  * | - active CPU group/bitset|
58  * ----------------------------
59  *
60  * The scheduler/dispatcher leverages knowledge of the performance
61  * relevant CMT sharing relationships existing between cpus to implement
62  * optimized affinity, load balancing, and coalescence policies.
63  *
 * Load balancing policy seeks to improve performance by minimizing
 * contention over shared processor resources / facilities. Affinity
 * policies seek to improve cache and TLB utilization. Coalescence
 * policies improve resource utilization and ultimately power efficiency.
68  *
69  * The CMT PGs created by this class are already arranged into a
70  * hierarchy (which is done in the pghw layer). To implement the top-down
71  * CMT load balancing algorithm, the CMT PGs additionally maintain
72  * parent, child and sibling hierarchy relationships.
 * Parent PGs always contain a superset of their children's resources,
74  * each PG can have at most one parent, and siblings are the group of PGs
75  * sharing the same parent.
76  *
77  * On UMA based systems, the CMT load balancing algorithm begins by balancing
78  * load across the group of top level PGs in the system hierarchy.
79  * On NUMA systems, the CMT load balancing algorithm balances load across the
80  * group of top level PGs in each leaf lgroup...but for root homed threads,
81  * is willing to balance against all the top level PGs in the system.
82  *
83  * Groups of top level PGs are maintained to implement the above, one for each
84  * leaf lgroup (containing the top level PGs in that lgroup), and one (for the
85  * root lgroup) that contains all the top level PGs in the system.
86  */
static cmt_lgrp_t	*cmt_lgrps = NULL;	/* cmt_lgrps list head */
static cmt_lgrp_t	*cpu0_lgrp = NULL;	/* boot CPU's initial lgrp */
						/* used for null_proc_lpa */
/*
 * cmt_lgrp_t for the root lgroup. It is created lazily by the first call to
 * pg_cmt_cpu_init(), and its cl_pgs group holds the top level PGs used when
 * balancing within the root lgroup.
 */
cmt_lgrp_t		*cmt_root = NULL;	/* Reference to root cmt pg */

/*
 * Cleared by the first pg_cmt_cpu_init() call to run to completion, which is
 * also when cpu0_lgrp is recorded.
 */
static int		is_cpu0 = 1; /* true if this is boot CPU context */

/*
 * Array of hardware sharing relationships that are blacklisted.
 * CMT scheduling optimizations won't be performed for blacklisted sharing
 * relationships. The array is indexed by pghw_type_t; an entry equal to 1
 * marks that relationship as blacklisted.
 */
static int		cmt_hw_blacklisted[PGHW_NUM_COMPONENTS];

/*
 * Set this to non-zero to disable CMT scheduling
 * This must be done via kmdb -d, as /etc/system will be too late
 */
int			cmt_sched_disabled = 0;

/*
 * Status codes for CMT lineage validation
 * See pg_cmt_lineage_validate() below
 *
 * pg_cmt_cpu_init() only proceeds when the status is CMT_LINEAGE_VALID or
 * CMT_LINEAGE_REPAIRED.
 */
typedef enum cmt_lineage_validation {
	CMT_LINEAGE_VALID,
	CMT_LINEAGE_NON_CONCENTRIC,
	CMT_LINEAGE_PG_SPANS_LGRPS,
	CMT_LINEAGE_NON_PROMOTABLE,
	CMT_LINEAGE_REPAIRED,
	CMT_LINEAGE_UNRECOVERABLE
} cmt_lineage_validation_t;

/*
 * Status of the current lineage under construction.
 * One must be holding cpu_lock to change this.
 */
cmt_lineage_validation_t	cmt_lineage_status = CMT_LINEAGE_VALID;

/*
 * Power domain definitions (on x86) are defined by ACPI, and
 * therefore may be subject to BIOS bugs.
 */
#define	PG_CMT_HW_SUSPECT(hw)	PGHW_IS_PM_DOMAIN(hw)

/*
 * Macro to test if PG is managed by the CMT PG class
 */
#define	IS_CMT_PG(pg)	(((pg_t *)(pg))->pg_class->pgc_id == pg_cmt_class_id)

/* Assigned by pg_cmt_class_init() when the class is registered */
static pg_cid_t		pg_cmt_class_id;		/* PG class id */
138 
139 static pg_t		*pg_cmt_alloc();
140 static void		pg_cmt_free(pg_t *);
141 static void		pg_cmt_cpu_init(cpu_t *, cpu_pg_t *);
142 static void		pg_cmt_cpu_fini(cpu_t *, cpu_pg_t *);
143 static void		pg_cmt_cpu_active(cpu_t *);
144 static void		pg_cmt_cpu_inactive(cpu_t *);
145 static void		pg_cmt_cpupart_in(cpu_t *, cpupart_t *);
146 static void		pg_cmt_cpupart_move(cpu_t *, cpupart_t *, cpupart_t *);
147 static char		*pg_cmt_policy_name(pg_t *);
148 static void		pg_cmt_hier_sort(pg_cmt_t **, int);
149 static pg_cmt_t		*pg_cmt_hier_rank(pg_cmt_t *, pg_cmt_t *);
150 static int		pg_cmt_cpu_belongs(pg_t *, cpu_t *);
151 static int		pg_cmt_hw(pghw_type_t);
152 static cmt_lgrp_t	*pg_cmt_find_lgrp(lgrp_handle_t);
153 static cmt_lgrp_t	*pg_cmt_lgrp_create(lgrp_handle_t);
154 static void		cmt_ev_thread_swtch(pg_t *, cpu_t *, hrtime_t,
155 			    kthread_t *, kthread_t *);
156 static void		cmt_ev_thread_swtch_pwr(pg_t *, cpu_t *, hrtime_t,
157 			    kthread_t *, kthread_t *);
158 static void		cmt_ev_thread_remain_pwr(pg_t *, cpu_t *, kthread_t *);
159 static cmt_lineage_validation_t	pg_cmt_lineage_validate(pg_cmt_t **, int *,
160 			    cpu_pg_t *);
161 
/*
 * CMT PG ops
 *
 * Callback vector handed to pg_class_register() by pg_cmt_class_init().
 * The initializer is positional, so the order of the entries below must
 * match the member order of struct pg_ops.
 */
struct pg_ops pg_ops_cmt = {
	pg_cmt_alloc,		/* class specific PG allocation */
	pg_cmt_free,		/* class specific PG de-allocation */
	pg_cmt_cpu_init,	/* new CPU entering the system */
	pg_cmt_cpu_fini,	/* CPU leaving the system */
	pg_cmt_cpu_active,	/* CPU becomes active (online) */
	pg_cmt_cpu_inactive,	/* CPU goes inactive (offline) */
	pg_cmt_cpupart_in,	/* CPU entering a cpu partition */
	NULL,			/* cpupart_out */
	pg_cmt_cpupart_move,	/* CPU actually moving partitions */
	pg_cmt_cpu_belongs,	/* does the CPU belong in the PG? */
	pg_cmt_policy_name,
};
178 
179 /*
180  * Initialize the CMT PG class
181  */
182 void
183 pg_cmt_class_init(void)
184 {
185 	if (cmt_sched_disabled)
186 		return;
187 
188 	pg_cmt_class_id = pg_class_register("cmt", &pg_ops_cmt, PGR_PHYSICAL);
189 }
190 
191 /*
192  * Called to indicate a new CPU has started up so
193  * that either t0 or the slave startup thread can
194  * be accounted for.
195  */
196 void
197 pg_cmt_cpu_startup(cpu_t *cp)
198 {
199 	pg_ev_thread_swtch(cp, gethrtime_unscaled(), cp->cpu_idle_thread,
200 	    cp->cpu_thread);
201 }
202 
203 /*
204  * Return non-zero if thread can migrate between "from" and "to"
205  * without a performance penalty
206  */
207 int
208 pg_cmt_can_migrate(cpu_t *from, cpu_t *to)
209 {
210 	if (from->cpu_physid->cpu_cacheid ==
211 	    to->cpu_physid->cpu_cacheid)
212 		return (1);
213 	return (0);
214 }
215 
216 /*
217  * CMT class specific PG allocation
218  */
219 static pg_t *
220 pg_cmt_alloc(void)
221 {
222 	return (kmem_zalloc(sizeof (pg_cmt_t), KM_NOSLEEP));
223 }
224 
225 /*
226  * Class specific PG de-allocation
227  */
228 static void
229 pg_cmt_free(pg_t *pg)
230 {
231 	ASSERT(pg != NULL);
232 	ASSERT(IS_CMT_PG(pg));
233 
234 	kmem_free((pg_cmt_t *)pg, sizeof (pg_cmt_t));
235 }
236 
237 /*
238  * Given a hardware sharing relationship, return which dispatcher
239  * policies should be implemented to optimize performance and efficiency
240  */
241 static pg_cmt_policy_t
242 pg_cmt_policy(pghw_type_t hw)
243 {
244 	pg_cmt_policy_t p;
245 
246 	/*
247 	 * Give the platform a chance to override the default
248 	 */
249 	if ((p = pg_plat_cmt_policy(hw)) != CMT_NO_POLICY)
250 		return (p);
251 
252 	switch (hw) {
253 	case PGHW_IPIPE:
254 	case PGHW_FPU:
255 	case PGHW_PROCNODE:
256 	case PGHW_CHIP:
257 		return (CMT_BALANCE);
258 	case PGHW_CACHE:
259 		return (CMT_AFFINITY);
260 	case PGHW_POW_ACTIVE:
261 	case PGHW_POW_IDLE:
262 		return (CMT_BALANCE);
263 	default:
264 		return (CMT_NO_POLICY);
265 	}
266 }
267 
268 /*
269  * Rank the importance of optimizing for the pg1 relationship vs.
270  * the pg2 relationship.
271  */
272 static pg_cmt_t *
273 pg_cmt_hier_rank(pg_cmt_t *pg1, pg_cmt_t *pg2)
274 {
275 	pghw_type_t hw1 = ((pghw_t *)pg1)->pghw_hw;
276 	pghw_type_t hw2 = ((pghw_t *)pg2)->pghw_hw;
277 
278 	/*
279 	 * A power domain is only important if CPUPM is enabled.
280 	 */
281 	if (cpupm_get_policy() == CPUPM_POLICY_DISABLED) {
282 		if (PGHW_IS_PM_DOMAIN(hw1) && !PGHW_IS_PM_DOMAIN(hw2))
283 			return (pg2);
284 		if (PGHW_IS_PM_DOMAIN(hw2) && !PGHW_IS_PM_DOMAIN(hw1))
285 			return (pg1);
286 	}
287 
288 	/*
289 	 * Otherwise, ask the platform
290 	 */
291 	if (pg_plat_hw_rank(hw1, hw2) == hw1)
292 		return (pg1);
293 	else
294 		return (pg2);
295 }
296 
297 /*
298  * Initialize CMT callbacks for the given PG
299  */
300 static void
301 cmt_callback_init(pg_t *pg)
302 {
303 	/*
304 	 * Stick with the default callbacks if there isn't going to be
305 	 * any CMT thread placement optimizations implemented.
306 	 */
307 	if (((pg_cmt_t *)pg)->cmt_policy == CMT_NO_POLICY)
308 		return;
309 
310 	switch (((pghw_t *)pg)->pghw_hw) {
311 	case PGHW_POW_ACTIVE:
312 		pg->pg_cb.thread_swtch = cmt_ev_thread_swtch_pwr;
313 		pg->pg_cb.thread_remain = cmt_ev_thread_remain_pwr;
314 		break;
315 	default:
316 		pg->pg_cb.thread_swtch = cmt_ev_thread_swtch;
317 
318 	}
319 }
320 
/*
 * Promote PG above its current parent.
 * This is only legal if PG has an equal or greater number of CPUs than its
 * parent.
 *
 * This routine operates on the CPU specific processor group data (for the CPUs
 * in the PG being promoted), and may be invoked from a context where one CPU's
 * PG data is under construction. In this case the argument "pgdata", if not
 * NULL, is a reference to the CPU's under-construction PG data.
 *
 * The caller must hold cpu_lock. All CPUs are paused while the hierarchy is
 * rearranged, and restarted before returning. A PG without a parent is left
 * untouched.
 */
static void
cmt_hier_promote(pg_cmt_t *pg, cpu_pg_t *pgdata)
{
	pg_cmt_t	*parent;
	group_t		*children;
	cpu_t		*cpu;
	group_iter_t	iter;
	pg_cpu_itr_t	cpu_iter;
	int		r;
	int		err;
	int		nchildren;

	ASSERT(MUTEX_HELD(&cpu_lock));

	parent = pg->cmt_parent;
	if (parent == NULL) {
		/*
		 * Nothing to do
		 */
		return;
	}

	ASSERT(PG_NUM_CPUS((pg_t *)pg) >= PG_NUM_CPUS((pg_t *)parent));

	/*
	 * We're changing around the hierarchy, which is actively traversed
	 * by the dispatcher. Pause CPUS to ensure exclusivity.
	 */
	pause_cpus(NULL);

	/*
	 * If necessary, update the parent's sibling set, replacing parent
	 * with PG.
	 * PG is only added if the parent was actually present in the set.
	 */
	if (parent->cmt_siblings) {
		if (group_remove(parent->cmt_siblings, parent, GRP_NORESIZE)
		    != -1) {
			r = group_add(parent->cmt_siblings, pg, GRP_NORESIZE);
			ASSERT(r != -1);
		}
	}

	/*
	 * If the parent is at the top of the hierarchy, replace its entry
	 * in the root lgroup's group of top level PGs.
	 * (When the parent's sibling set is the root group itself, the
	 * replacement was already done above.)
	 */
	if (parent->cmt_parent == NULL &&
	    parent->cmt_siblings != &cmt_root->cl_pgs) {
		if (group_remove(&cmt_root->cl_pgs, parent, GRP_NORESIZE)
		    != -1) {
			r = group_add(&cmt_root->cl_pgs, pg, GRP_NORESIZE);
			ASSERT(r != -1);
		}
	}

	/*
	 * We assume (and therefore assert) that the PG being promoted is an
	 * only child of its parent. Update the parent's children set
	 * replacing PG's entry with the parent (since the parent is becoming
	 * the child). Then have PG and the parent swap children sets and
	 * children counts.
	 */
	ASSERT(GROUP_SIZE(parent->cmt_children) <= 1);
	if (group_remove(parent->cmt_children, pg, GRP_NORESIZE) != -1) {
		r = group_add(parent->cmt_children, parent, GRP_NORESIZE);
		ASSERT(r != -1);
	}

	children = pg->cmt_children;
	pg->cmt_children = parent->cmt_children;
	parent->cmt_children = children;

	nchildren = pg->cmt_nchildren;
	pg->cmt_nchildren = parent->cmt_nchildren;
	parent->cmt_nchildren = nchildren;

	/*
	 * Update the sibling references for PG and its parent
	 * PG takes over the parent's old sibling set; the parent's siblings
	 * become PG's (just swapped in) children set.
	 */
	pg->cmt_siblings = parent->cmt_siblings;
	parent->cmt_siblings = pg->cmt_children;

	/*
	 * Update any cached lineages in the per CPU pg data.
	 */
	PG_CPU_ITR_INIT(pg, cpu_iter);
	while ((cpu = pg_cpu_next(&cpu_iter)) != NULL) {
		int		idx;
		int		sz;
		pg_cmt_t	*cpu_pg;
		cpu_pg_t	*pgd;	/* CPU's PG data */

		/*
		 * The CPU whose lineage is under construction still
		 * references the bootstrap CPU PG data structure.
		 */
		if (pg_cpu_is_bootstrapped(cpu))
			pgd = pgdata;
		else
			pgd = cpu->cpu_pg;

		/*
		 * Iterate over the CPU's PGs updating the children
		 * of the PG being promoted, since they have a new parent.
		 */
		group_iter_init(&iter);
		while ((cpu_pg = group_iterate(&pgd->cmt_pgs, &iter)) != NULL) {
			if (cpu_pg->cmt_parent == pg) {
				cpu_pg->cmt_parent = parent;
			}
		}

		/*
		 * Update the CMT load balancing lineage
		 */
		if ((idx = group_find(&pgd->cmt_pgs, (void *)pg)) == -1) {
			/*
			 * Unless this is the CPU whose lineage is being
			 * constructed, the PG being promoted should be
			 * in the lineage.
			 */
			ASSERT(pg_cpu_is_bootstrapped(cpu));
			continue;
		}

		/*
		 * cmt_pgs is ordered top-down, so the parent is expected to
		 * sit immediately before PG.
		 */
		ASSERT(idx > 0);
		ASSERT(GROUP_ACCESS(&pgd->cmt_pgs, idx - 1) == parent);

		/*
		 * Have the child and the parent swap places in the CPU's
		 * lineage
		 */
		group_remove_at(&pgd->cmt_pgs, idx);
		group_remove_at(&pgd->cmt_pgs, idx - 1);
		err = group_add_at(&pgd->cmt_pgs, parent, idx);
		ASSERT(err == 0);
		err = group_add_at(&pgd->cmt_pgs, pg, idx - 1);
		ASSERT(err == 0);

		/*
		 * Ensure cmt_lineage references CPU's leaf PG.
		 * Since cmt_pgs is top-down ordered, the bottom is the last
		 * element.
		 */
		if ((sz = GROUP_SIZE(&pgd->cmt_pgs)) > 0)
			pgd->cmt_lineage = GROUP_ACCESS(&pgd->cmt_pgs, sz - 1);
	}

	/*
	 * Update the parent references for PG and its parent
	 */
	pg->cmt_parent = parent->cmt_parent;
	parent->cmt_parent = pg;

	start_cpus();
}
487 
/*
 * CMT class callback for a new CPU entering the system
 *
 * This routine operates on the CPU specific processor group data (for the CPU
 * being initialized). The argument "pgdata" is a reference to the CPU's PG
 * data to be constructed.
 *
 * cp->cpu_pg, which the dispatcher uses to access the CPU's PG data,
 * references a "bootstrap" structure while this routine runs.
 * pg_cmt_cpu_init() and the routines it calls must be careful to operate
 * only on the "pgdata" argument, and not cp->cpu_pg.
 *
 * The caller must hold cpu_lock. Nothing is done if CMT scheduling is
 * disabled.
 */
static void
pg_cmt_cpu_init(cpu_t *cp, cpu_pg_t *pgdata)
{
	pg_cmt_t	*pg;
	group_t		*cmt_pgs;
	int		levels, level;
	pghw_type_t	hw;
	pg_t		*pg_cache = NULL;
	pg_cmt_t	*cpu_cmt_hier[PGHW_NUM_COMPONENTS];
	lgrp_handle_t	lgrp_handle;
	cmt_lgrp_t	*lgrp;
	cmt_lineage_validation_t	lineage_status;

	ASSERT(MUTEX_HELD(&cpu_lock));
	ASSERT(pg_cpu_is_bootstrapped(cp));

	if (cmt_sched_disabled)
		return;

	/*
	 * A new CPU is coming into the system.
	 * Interrogate the platform to see if the CPU
	 * has any performance or efficiency relevant
	 * sharing relationships
	 */
	cmt_pgs = &pgdata->cmt_pgs;
	pgdata->cmt_lineage = NULL;

	bzero(cpu_cmt_hier, sizeof (cpu_cmt_hier));
	levels = 0;
	for (hw = PGHW_START; hw < PGHW_NUM_COMPONENTS; hw++) {

		pg_cmt_policy_t	policy;

		/*
		 * We're only interested in the hw sharing relationships
		 * for which we know how to optimize.
		 */
		policy = pg_cmt_policy(hw);
		if (policy == CMT_NO_POLICY ||
		    pg_plat_hw_shared(cp, hw) == 0)
			continue;

		/*
		 * We will still create the PGs for hardware sharing
		 * relationships that have been blacklisted, but won't
		 * implement CMT thread placement optimizations against them.
		 */
		if (cmt_hw_blacklisted[hw] == 1)
			policy = CMT_NO_POLICY;

		/*
		 * Find (or create) the PG associated with
		 * the hw sharing relationship in which cp
		 * belongs.
		 *
		 * Determine if a suitable PG already
		 * exists, or if one needs to be created.
		 */
		pg = (pg_cmt_t *)pghw_place_cpu(cp, hw);
		if (pg == NULL) {
			/*
			 * Create a new one.
			 * Initialize the common...
			 */
			pg = (pg_cmt_t *)pg_create(pg_cmt_class_id);

			/* ... physical ... */
			pghw_init((pghw_t *)pg, cp, hw);

			/*
			 * ... and CMT specific portions of the
			 * structure.
			 */
			pg->cmt_policy = policy;

			/* CMT event callbacks */
			cmt_callback_init((pg_t *)pg);

			bitset_init(&pg->cmt_cpus_actv_set);
			group_create(&pg->cmt_cpus_actv);
		} else {
			ASSERT(IS_CMT_PG(pg));
		}

		/* The PG's CPU membership is about to change */
		((pghw_t *)pg)->pghw_generation++;

		/* Add the CPU to the PG */
		pg_cpu_add((pg_t *)pg, cp, pgdata);

		/*
		 * Ensure capacity of the active CPU group/bitset
		 */
		group_expand(&pg->cmt_cpus_actv,
		    GROUP_SIZE(&((pg_t *)pg)->pg_cpus));

		if (cp->cpu_seqid >=
		    bitset_capacity(&pg->cmt_cpus_actv_set)) {
			bitset_resize(&pg->cmt_cpus_actv_set,
			    cp->cpu_seqid + 1);
		}

		/*
		 * Build a lineage of CMT PGs for load balancing / coalescence
		 * Note that "policy" is CMT_NO_POLICY for blacklisted
		 * relationships.
		 */
		if (policy & (CMT_BALANCE | CMT_COALESCE)) {
			cpu_cmt_hier[levels++] = pg;
		}

		/* Cache this for later */
		if (hw == PGHW_CACHE)
			pg_cache = (pg_t *)pg;
	}

	group_expand(cmt_pgs, levels);

	/* The first CPU through here creates the root lgroup's cmt_lgrp_t */
	if (cmt_root == NULL)
		cmt_root = pg_cmt_lgrp_create(lgrp_plat_root_hand());

	/*
	 * Find the lgrp that encapsulates this CPU's CMT hierarchy
	 */
	lgrp_handle = lgrp_plat_cpu_to_hand(cp->cpu_id);
	if ((lgrp = pg_cmt_find_lgrp(lgrp_handle)) == NULL)
		lgrp = pg_cmt_lgrp_create(lgrp_handle);

	/*
	 * Ascendingly sort the PGs in the lineage by number of CPUs
	 */
	pg_cmt_hier_sort(cpu_cmt_hier, levels);

	/*
	 * Examine the lineage and validate it.
	 * This routine will also try to fix the lineage along with the
	 * rest of the PG hierarchy should it detect an issue.
	 *
	 * If it returns anything other than VALID or REPAIRED, an
	 * unrecoverable error has occurred, and we cannot proceed.
	 */
	lineage_status = pg_cmt_lineage_validate(cpu_cmt_hier, &levels, pgdata);
	if ((lineage_status != CMT_LINEAGE_VALID) &&
	    (lineage_status != CMT_LINEAGE_REPAIRED)) {
		/*
		 * In the case of an unrecoverable error where CMT scheduling
		 * has been disabled, assert that the under construction CPU's
		 * PG data has an empty CMT load balancing lineage.
		 */
		ASSERT((cmt_sched_disabled == 0) ||
		    (GROUP_SIZE(&(pgdata->cmt_pgs)) == 0));
		return;
	}

	/*
	 * For existing PGs in the lineage, verify that the parent is
	 * correct, as the generation in the lineage may have changed
	 * as a result of the sorting. Start the traversal at the top
	 * of the lineage, moving down.
	 *
	 * Whenever a promotion took place, the traversal restarts from
	 * the top of the lineage.
	 */
	for (level = levels - 1; level >= 0; ) {
		int reorg;

		reorg = 0;
		pg = cpu_cmt_hier[level];

		/*
		 * Promote PGs at an incorrect generation into place.
		 */
		while (pg->cmt_parent &&
		    pg->cmt_parent != cpu_cmt_hier[level + 1]) {
			cmt_hier_promote(pg, pgdata);
			reorg++;
		}
		if (reorg > 0)
			level = levels - 1;
		else
			level--;
	}

	/*
	 * For each of the PGs in the CPU's lineage:
	 *	- Add an entry in the CPU sorted CMT PG group
	 *	  which is used for top down CMT load balancing
	 *	- Tie the PG into the CMT hierarchy by connecting
	 *	  it to its parent and siblings.
	 */
	for (level = 0; level < levels; level++) {
		uint_t		children;
		int		err;

		/*
		 * cpu_cmt_hier[] is sorted bottom-up, while cmt_pgs is kept
		 * top-down, hence the reversed index.
		 */
		pg = cpu_cmt_hier[level];
		err = group_add_at(cmt_pgs, pg, levels - level - 1);
		ASSERT(err == 0);

		/* Level 0 is the leaf of the lineage */
		if (level == 0)
			pgdata->cmt_lineage = (pg_t *)pg;

		if (pg->cmt_siblings != NULL) {
			/* Already initialized */
			ASSERT(pg->cmt_parent == NULL ||
			    pg->cmt_parent == cpu_cmt_hier[level + 1]);
			ASSERT(pg->cmt_siblings == &lgrp->cl_pgs ||
			    ((pg->cmt_parent != NULL) &&
			    pg->cmt_siblings == pg->cmt_parent->cmt_children));
			continue;
		}

		if ((level + 1) == levels) {
			/*
			 * Top of the lineage: no parent, and the siblings
			 * are the lgroup's top level PGs.
			 */
			pg->cmt_parent = NULL;

			pg->cmt_siblings = &lgrp->cl_pgs;
			children = ++lgrp->cl_npgs;
			if (cmt_root != lgrp)
				cmt_root->cl_npgs++;
		} else {
			pg->cmt_parent = cpu_cmt_hier[level + 1];

			/*
			 * A good parent keeps track of their children.
			 * The parent's children group is also the PG's
			 * siblings.
			 */
			if (pg->cmt_parent->cmt_children == NULL) {
				pg->cmt_parent->cmt_children =
				    kmem_zalloc(sizeof (group_t), KM_SLEEP);
				group_create(pg->cmt_parent->cmt_children);
			}
			pg->cmt_siblings = pg->cmt_parent->cmt_children;
			children = ++pg->cmt_parent->cmt_nchildren;
		}

		/*
		 * Size the sibling and root groups now; PGs are added to
		 * them later with GRP_NORESIZE (see pg_cmt_cpu_active()).
		 */
		group_expand(pg->cmt_siblings, children);
		group_expand(&cmt_root->cl_pgs, cmt_root->cl_npgs);
	}

	/*
	 * Cache the chip and core IDs in the cpu_t->cpu_physid structure
	 * for fast lookups later.
	 */
	if (cp->cpu_physid) {
		cp->cpu_physid->cpu_chipid =
		    pg_plat_hw_instance_id(cp, PGHW_CHIP);
		cp->cpu_physid->cpu_coreid = pg_plat_get_core_id(cp);

		/*
		 * If this cpu has a PG representing shared cache, then set
		 * cpu_cacheid to that PG's logical id
		 */
		if (pg_cache)
			cp->cpu_physid->cpu_cacheid = pg_cache->pg_id;
	}

	/* CPU0 only initialization */
	if (is_cpu0) {
		is_cpu0 = 0;
		cpu0_lgrp = lgrp;
	}

}
758 
/*
 * Class callback when a CPU is leaving the system (deletion)
 *
 * "pgdata" is a reference to the CPU's PG data to be deconstructed.
 *
 * cp->cpu_pg, which the dispatcher uses to access the CPU's PG data,
 * references a "bootstrap" structure across this function's invocation.
 * pg_cmt_cpu_fini() and the routines it calls must be careful to operate only
 * on the "pgdata" argument, and not cp->cpu_pg.
 *
 * Nothing is done if CMT scheduling is disabled.
 */
static void
pg_cmt_cpu_fini(cpu_t *cp, cpu_pg_t *pgdata)
{
	group_iter_t	i;
	pg_cmt_t	*pg;
	group_t		*pgs, *cmt_pgs;
	lgrp_handle_t	lgrp_handle;
	cmt_lgrp_t	*lgrp;

	if (cmt_sched_disabled)
		return;

	ASSERT(pg_cpu_is_bootstrapped(cp));

	pgs = &pgdata->pgs;
	cmt_pgs = &pgdata->cmt_pgs;

	/*
	 * Find the lgroup that encapsulates this CPU's CMT hierarchy
	 */
	lgrp_handle = lgrp_plat_cpu_to_hand(cp->cpu_id);

	lgrp = pg_cmt_find_lgrp(lgrp_handle);
	if (ncpus == 1 && lgrp != cpu0_lgrp) {
		/*
		 * One might wonder how we could be deconfiguring the
		 * only CPU in the system.
		 *
		 * On Starcat systems when null_proc_lpa is detected,
		 * the boot CPU (which is already configured into a leaf
		 * lgroup), is moved into the root lgroup. This is done by
		 * deconfiguring it from both lgroups and processor
		 * groups, and then later reconfiguring it back in.  This
		 * call to pg_cmt_cpu_fini() is part of that deconfiguration.
		 *
		 * This special case is detected by noting that the platform
		 * has changed the CPU's lgrp affiliation (since it now
		 * belongs in the root). In this case, use the cmt_lgrp_t
		 * cached for the boot CPU, since this is what needs to be
		 * torn down.
		 */
		lgrp = cpu0_lgrp;
	}

	ASSERT(lgrp != NULL);

	/*
	 * First, clean up anything load balancing specific for each of
	 * the CPU's PGs that participated in CMT load balancing
	 *
	 * The walk starts at the leaf of the lineage and follows the
	 * parent links upward.
	 */
	pg = (pg_cmt_t *)pgdata->cmt_lineage;
	while (pg != NULL) {

		((pghw_t *)pg)->pghw_generation++;

		/*
		 * Remove the PG from the CPU's load balancing lineage
		 */
		(void) group_remove(cmt_pgs, pg, GRP_RESIZE);

		/*
		 * If it's about to become empty, destroy its children
		 * group, and remove its reference from its siblings.
		 * This is done here (rather than below) to avoid removing
		 * our reference from a PG that we just eliminated.
		 *
		 * NOTE(review): cmt_children is a group_t obtained from
		 * kmem_zalloc() in pg_cmt_cpu_init(). Confirm whether
		 * group_destroy() also releases the group_t itself; if it
		 * does not, that allocation is leaked here.
		 */
		if (GROUP_SIZE(&((pg_t *)pg)->pg_cpus) == 1) {
			if (pg->cmt_children != NULL)
				group_destroy(pg->cmt_children);
			if (pg->cmt_siblings != NULL) {
				if (pg->cmt_siblings == &lgrp->cl_pgs)
					lgrp->cl_npgs--;
				else
					pg->cmt_parent->cmt_nchildren--;
			}
		}
		pg = pg->cmt_parent;
	}
	ASSERT(GROUP_SIZE(cmt_pgs) == 0);

	/*
	 * Now that the load balancing lineage updates have happened,
	 * remove the CPU from all its PGs (destroying any that become
	 * empty).
	 */
	group_iter_init(&i);
	while ((pg = group_iterate(pgs, &i)) != NULL) {
		if (IS_CMT_PG(pg) == 0)
			continue;

		pg_cpu_delete((pg_t *)pg, cp, pgdata);
		/*
		 * Deleting the CPU from the PG changes the CPU's
		 * PG group over which we are actively iterating
		 * Re-initialize the iteration
		 */
		group_iter_init(&i);

		if (GROUP_SIZE(&((pg_t *)pg)->pg_cpus) == 0) {

			/*
			 * The PG has become zero sized, so destroy it.
			 */
			group_destroy(&pg->cmt_cpus_actv);
			bitset_fini(&pg->cmt_cpus_actv_set);
			pghw_fini((pghw_t *)pg);

			pg_destroy((pg_t *)pg);
		}
	}
}
880 
881 /*
882  * Class callback when a CPU is entering a cpu partition
883  */
884 static void
885 pg_cmt_cpupart_in(cpu_t *cp, cpupart_t *pp)
886 {
887 	group_t		*pgs;
888 	pg_t		*pg;
889 	group_iter_t	i;
890 
891 	ASSERT(MUTEX_HELD(&cpu_lock));
892 
893 	if (cmt_sched_disabled)
894 		return;
895 
896 	pgs = &cp->cpu_pg->pgs;
897 
898 	/*
899 	 * Ensure that the new partition's PG bitset
900 	 * is large enough for all CMT PG's to which cp
901 	 * belongs
902 	 */
903 	group_iter_init(&i);
904 	while ((pg = group_iterate(pgs, &i)) != NULL) {
905 		if (IS_CMT_PG(pg) == 0)
906 			continue;
907 
908 		if (bitset_capacity(&pp->cp_cmt_pgs) <= pg->pg_id)
909 			bitset_resize(&pp->cp_cmt_pgs, pg->pg_id + 1);
910 	}
911 }
912 
913 /*
914  * Class callback when a CPU is actually moving partitions
915  */
916 static void
917 pg_cmt_cpupart_move(cpu_t *cp, cpupart_t *oldpp, cpupart_t *newpp)
918 {
919 	cpu_t		*cpp;
920 	group_t		*pgs;
921 	pg_t		*pg;
922 	group_iter_t	pg_iter;
923 	pg_cpu_itr_t	cpu_iter;
924 	boolean_t	found;
925 
926 	ASSERT(MUTEX_HELD(&cpu_lock));
927 
928 	if (cmt_sched_disabled)
929 		return;
930 
931 	pgs = &cp->cpu_pg->pgs;
932 	group_iter_init(&pg_iter);
933 
934 	/*
935 	 * Iterate over the CPUs CMT PGs
936 	 */
937 	while ((pg = group_iterate(pgs, &pg_iter)) != NULL) {
938 
939 		if (IS_CMT_PG(pg) == 0)
940 			continue;
941 
942 		/*
943 		 * Add the PG to the bitset in the new partition.
944 		 */
945 		bitset_add(&newpp->cp_cmt_pgs, pg->pg_id);
946 
947 		/*
948 		 * Remove the PG from the bitset in the old partition
949 		 * if the last of the PG's CPUs have left.
950 		 */
951 		found = B_FALSE;
952 		PG_CPU_ITR_INIT(pg, cpu_iter);
953 		while ((cpp = pg_cpu_next(&cpu_iter)) != NULL) {
954 			if (cpp == cp)
955 				continue;
956 			if (CPU_ACTIVE(cpp) &&
957 			    cpp->cpu_part->cp_id == oldpp->cp_id) {
958 				found = B_TRUE;
959 				break;
960 			}
961 		}
962 		if (!found)
963 			bitset_del(&cp->cpu_part->cp_cmt_pgs, pg->pg_id);
964 	}
965 }
966 
967 /*
968  * Class callback when a CPU becomes active (online)
969  *
970  * This is called in a context where CPUs are paused
971  */
972 static void
973 pg_cmt_cpu_active(cpu_t *cp)
974 {
975 	int		err;
976 	group_iter_t	i;
977 	pg_cmt_t	*pg;
978 	group_t		*pgs;
979 
980 	ASSERT(MUTEX_HELD(&cpu_lock));
981 
982 	if (cmt_sched_disabled)
983 		return;
984 
985 	pgs = &cp->cpu_pg->pgs;
986 	group_iter_init(&i);
987 
988 	/*
989 	 * Iterate over the CPU's PGs
990 	 */
991 	while ((pg = group_iterate(pgs, &i)) != NULL) {
992 
993 		if (IS_CMT_PG(pg) == 0)
994 			continue;
995 
996 		/*
997 		 * Move to the next generation since topology is changing
998 		 */
999 		((pghw_t *)pg)->pghw_generation++;
1000 
1001 		err = group_add(&pg->cmt_cpus_actv, cp, GRP_NORESIZE);
1002 		ASSERT(err == 0);
1003 
1004 		/*
1005 		 * If this is the first active CPU in the PG, and it
1006 		 * represents a hardware sharing relationship over which
1007 		 * CMT load balancing is performed, add it as a candidate
1008 		 * for balancing with it's siblings.
1009 		 */
1010 		if (GROUP_SIZE(&pg->cmt_cpus_actv) == 1 &&
1011 		    (pg->cmt_policy & (CMT_BALANCE | CMT_COALESCE))) {
1012 			err = group_add(pg->cmt_siblings, pg, GRP_NORESIZE);
1013 			ASSERT(err == 0);
1014 
1015 			/*
1016 			 * If this is a top level PG, add it as a balancing
1017 			 * candidate when balancing within the root lgroup.
1018 			 */
1019 			if (pg->cmt_parent == NULL &&
1020 			    pg->cmt_siblings != &cmt_root->cl_pgs) {
1021 				err = group_add(&cmt_root->cl_pgs, pg,
1022 				    GRP_NORESIZE);
1023 				ASSERT(err == 0);
1024 			}
1025 		}
1026 
1027 		/*
1028 		 * Notate the CPU in the PGs active CPU bitset.
1029 		 * Also notate the PG as being active in it's associated
1030 		 * partition
1031 		 */
1032 		bitset_add(&pg->cmt_cpus_actv_set, cp->cpu_seqid);
1033 		bitset_add(&cp->cpu_part->cp_cmt_pgs, ((pg_t *)pg)->pg_id);
1034 	}
1035 }
1036 
1037 /*
1038  * Class callback when a CPU goes inactive (offline)
1039  *
1040  * This is called in a context where CPUs are paused
1041  */
1042 static void
1043 pg_cmt_cpu_inactive(cpu_t *cp)
1044 {
1045 	int		err;
1046 	group_t		*pgs;
1047 	pg_cmt_t	*pg;
1048 	cpu_t		*cpp;
1049 	group_iter_t	i;
1050 	pg_cpu_itr_t	cpu_itr;
1051 	boolean_t	found;
1052 
1053 	ASSERT(MUTEX_HELD(&cpu_lock));
1054 
1055 	if (cmt_sched_disabled)
1056 		return;
1057 
1058 	pgs = &cp->cpu_pg->pgs;
1059 	group_iter_init(&i);
1060 
1061 	while ((pg = group_iterate(pgs, &i)) != NULL) {
1062 
1063 		if (IS_CMT_PG(pg) == 0)
1064 			continue;
1065 
1066 		/*
1067 		 * Move to the next generation since topology is changing
1068 		 */
1069 		((pghw_t *)pg)->pghw_generation++;
1070 
1071 		/*
1072 		 * Remove the CPU from the CMT PGs active CPU group
1073 		 * bitmap
1074 		 */
1075 		err = group_remove(&pg->cmt_cpus_actv, cp, GRP_NORESIZE);
1076 		ASSERT(err == 0);
1077 
1078 		bitset_del(&pg->cmt_cpus_actv_set, cp->cpu_seqid);
1079 
1080 		/*
1081 		 * If there are no more active CPUs in this PG over which
1082 		 * load was balanced, remove it as a balancing candidate.
1083 		 */
1084 		if (GROUP_SIZE(&pg->cmt_cpus_actv) == 0 &&
1085 		    (pg->cmt_policy & (CMT_BALANCE | CMT_COALESCE))) {
1086 			err = group_remove(pg->cmt_siblings, pg, GRP_NORESIZE);
1087 			ASSERT(err == 0);
1088 
1089 			if (pg->cmt_parent == NULL &&
1090 			    pg->cmt_siblings != &cmt_root->cl_pgs) {
1091 				err = group_remove(&cmt_root->cl_pgs, pg,
1092 				    GRP_NORESIZE);
1093 				ASSERT(err == 0);
1094 			}
1095 		}
1096 
1097 		/*
1098 		 * Assert the number of active CPUs does not exceed
1099 		 * the total number of CPUs in the PG
1100 		 */
1101 		ASSERT(GROUP_SIZE(&pg->cmt_cpus_actv) <=
1102 		    GROUP_SIZE(&((pg_t *)pg)->pg_cpus));
1103 
1104 		/*
1105 		 * Update the PG bitset in the CPU's old partition
1106 		 */
1107 		found = B_FALSE;
1108 		PG_CPU_ITR_INIT(pg, cpu_itr);
1109 		while ((cpp = pg_cpu_next(&cpu_itr)) != NULL) {
1110 			if (cpp == cp)
1111 				continue;
1112 			if (CPU_ACTIVE(cpp) &&
1113 			    cpp->cpu_part->cp_id == cp->cpu_part->cp_id) {
1114 				found = B_TRUE;
1115 				break;
1116 			}
1117 		}
1118 		if (!found) {
1119 			bitset_del(&cp->cpu_part->cp_cmt_pgs,
1120 			    ((pg_t *)pg)->pg_id);
1121 		}
1122 	}
1123 }
1124 
1125 /*
1126  * Return non-zero if the CPU belongs in the given PG
1127  */
1128 static int
1129 pg_cmt_cpu_belongs(pg_t *pg, cpu_t *cp)
1130 {
1131 	cpu_t	*pg_cpu;
1132 
1133 	pg_cpu = GROUP_ACCESS(&pg->pg_cpus, 0);
1134 
1135 	ASSERT(pg_cpu != NULL);
1136 
1137 	/*
1138 	 * The CPU belongs if, given the nature of the hardware sharing
1139 	 * relationship represented by the PG, the CPU has that
1140 	 * relationship with some other CPU already in the PG
1141 	 */
1142 	if (pg_plat_cpus_share(cp, pg_cpu, ((pghw_t *)pg)->pghw_hw))
1143 		return (1);
1144 
1145 	return (0);
1146 }
1147 
1148 /*
1149  * Sort the CPUs CMT hierarchy, where "size" is the number of levels.
1150  */
1151 static void
1152 pg_cmt_hier_sort(pg_cmt_t **hier, int size)
1153 {
1154 	int		i, j, inc, sz;
1155 	int		start, end;
1156 	pg_t		*tmp;
1157 	pg_t		**h = (pg_t **)hier;
1158 
1159 	/*
1160 	 * First sort by number of CPUs
1161 	 */
1162 	inc = size / 2;
1163 	while (inc > 0) {
1164 		for (i = inc; i < size; i++) {
1165 			j = i;
1166 			tmp = h[i];
1167 			while ((j >= inc) &&
1168 			    (PG_NUM_CPUS(h[j - inc]) > PG_NUM_CPUS(tmp))) {
1169 				h[j] = h[j - inc];
1170 				j = j - inc;
1171 			}
1172 			h[j] = tmp;
1173 		}
1174 		if (inc == 2)
1175 			inc = 1;
1176 		else
1177 			inc = (inc * 5) / 11;
1178 	}
1179 
1180 	/*
1181 	 * Break ties by asking the platform.
1182 	 * Determine if h[i] outranks h[i + 1] and if so, swap them.
1183 	 */
1184 	for (start = 0; start < size; start++) {
1185 
1186 		/*
1187 		 * Find various contiguous sets of elements,
1188 		 * in the array, with the same number of cpus
1189 		 */
1190 		end = start;
1191 		sz = PG_NUM_CPUS(h[start]);
1192 		while ((end < size) && (sz == PG_NUM_CPUS(h[end])))
1193 			end++;
1194 		/*
1195 		 * Sort each such set of the array by rank
1196 		 */
1197 		for (i = start + 1; i < end; i++) {
1198 			j = i - 1;
1199 			tmp = h[i];
1200 			while (j >= start &&
1201 			    pg_cmt_hier_rank(hier[j],
1202 			    (pg_cmt_t *)tmp) == hier[j]) {
1203 				h[j + 1] = h[j];
1204 				j--;
1205 			}
1206 			h[j + 1] = tmp;
1207 		}
1208 	}
1209 }
1210 
1211 /*
1212  * Return a cmt_lgrp_t * given an lgroup handle.
1213  */
1214 static cmt_lgrp_t *
1215 pg_cmt_find_lgrp(lgrp_handle_t hand)
1216 {
1217 	cmt_lgrp_t	*lgrp;
1218 
1219 	ASSERT(MUTEX_HELD(&cpu_lock));
1220 
1221 	lgrp = cmt_lgrps;
1222 	while (lgrp != NULL) {
1223 		if (lgrp->cl_hand == hand)
1224 			break;
1225 		lgrp = lgrp->cl_next;
1226 	}
1227 	return (lgrp);
1228 }
1229 
1230 /*
1231  * Create a cmt_lgrp_t with the specified handle.
1232  */
1233 static cmt_lgrp_t *
1234 pg_cmt_lgrp_create(lgrp_handle_t hand)
1235 {
1236 	cmt_lgrp_t	*lgrp;
1237 
1238 	ASSERT(MUTEX_HELD(&cpu_lock));
1239 
1240 	lgrp = kmem_zalloc(sizeof (cmt_lgrp_t), KM_SLEEP);
1241 
1242 	lgrp->cl_hand = hand;
1243 	lgrp->cl_npgs = 0;
1244 	lgrp->cl_next = cmt_lgrps;
1245 	cmt_lgrps = lgrp;
1246 	group_create(&lgrp->cl_pgs);
1247 
1248 	return (lgrp);
1249 }
1250 
1251 /*
1252  * Interfaces to enable and disable power aware dispatching
1253  * The caller must be holding cpu_lock.
1254  *
1255  * Return 0 on success and -1 on failure.
1256  */
1257 int
1258 cmt_pad_enable(pghw_type_t type)
1259 {
1260 	group_t		*hwset;
1261 	group_iter_t	iter;
1262 	pg_cmt_t	*pg;
1263 
1264 	ASSERT(PGHW_IS_PM_DOMAIN(type));
1265 	ASSERT(MUTEX_HELD(&cpu_lock));
1266 
1267 	if ((hwset = pghw_set_lookup(type)) == NULL ||
1268 	    cmt_hw_blacklisted[type]) {
1269 		/*
1270 		 * Unable to find any instances of the specified type
1271 		 * of power domain, or the power domains have been blacklisted.
1272 		 */
1273 		return (-1);
1274 	}
1275 
1276 	/*
1277 	 * Iterate over the power domains, setting the default dispatcher
1278 	 * policy for power/performance optimization.
1279 	 *
1280 	 * Simply setting the policy isn't enough in the case where the power
1281 	 * domain is an only child of another PG. Because the dispatcher walks
1282 	 * the PG hierarchy in a top down fashion, the higher up PG's policy
1283 	 * will dominate. So promote the power domain above it's parent if both
1284 	 * PG and it's parent have the same CPUs to ensure it's policy
1285 	 * dominates.
1286 	 */
1287 	group_iter_init(&iter);
1288 	while ((pg = group_iterate(hwset, &iter)) != NULL) {
1289 		/*
1290 		 * If the power domain is an only child to a parent
1291 		 * not implementing the same policy, promote the child
1292 		 * above the parent to activate the policy.
1293 		 */
1294 		pg->cmt_policy = pg_cmt_policy(((pghw_t *)pg)->pghw_hw);
1295 		while ((pg->cmt_parent != NULL) &&
1296 		    (pg->cmt_parent->cmt_policy != pg->cmt_policy) &&
1297 		    (PG_NUM_CPUS((pg_t *)pg) ==
1298 		    PG_NUM_CPUS((pg_t *)pg->cmt_parent))) {
1299 			cmt_hier_promote(pg, NULL);
1300 		}
1301 	}
1302 
1303 	return (0);
1304 }
1305 
1306 int
1307 cmt_pad_disable(pghw_type_t type)
1308 {
1309 	group_t		*hwset;
1310 	group_iter_t	iter;
1311 	pg_cmt_t	*pg;
1312 	pg_cmt_t	*child;
1313 
1314 	ASSERT(PGHW_IS_PM_DOMAIN(type));
1315 	ASSERT(MUTEX_HELD(&cpu_lock));
1316 
1317 	if ((hwset = pghw_set_lookup(type)) == NULL) {
1318 		/*
1319 		 * Unable to find any instances of the specified type of
1320 		 * power domain.
1321 		 */
1322 		return (-1);
1323 	}
1324 	/*
1325 	 * Iterate over the power domains, setting the default dispatcher
1326 	 * policy for performance optimization (load balancing).
1327 	 */
1328 	group_iter_init(&iter);
1329 	while ((pg = group_iterate(hwset, &iter)) != NULL) {
1330 
1331 		/*
1332 		 * If the power domain has an only child that implements
1333 		 * policy other than load balancing, promote the child
1334 		 * above the power domain to ensure it's policy dominates.
1335 		 */
1336 		if (pg->cmt_children != NULL &&
1337 		    GROUP_SIZE(pg->cmt_children) == 1) {
1338 			child = GROUP_ACCESS(pg->cmt_children, 0);
1339 			if ((child->cmt_policy & CMT_BALANCE) == 0) {
1340 				cmt_hier_promote(child, NULL);
1341 			}
1342 		}
1343 		pg->cmt_policy = CMT_BALANCE;
1344 	}
1345 	return (0);
1346 }
1347 
1348 /* ARGSUSED */
1349 static void
1350 cmt_ev_thread_swtch(pg_t *pg, cpu_t *cp, hrtime_t now, kthread_t *old,
1351 		    kthread_t *new)
1352 {
1353 	pg_cmt_t	*cmt_pg = (pg_cmt_t *)pg;
1354 
1355 	if (old == cp->cpu_idle_thread) {
1356 		atomic_add_32(&cmt_pg->cmt_utilization, 1);
1357 	} else if (new == cp->cpu_idle_thread) {
1358 		atomic_add_32(&cmt_pg->cmt_utilization, -1);
1359 	}
1360 }
1361 
/*
 * Evaluates to non-zero when thread "t" is in the TS_RUN state, its
 * dispatch queue is associated with a CPU, and that CPU is a member of
 * the active CPU bitset of CMT PG "pg".
 *
 * Both arguments are evaluated more than once.
 */
#define	THREAD_RUNNABLE_IN_PG(t, pg)					\
	(((t)->t_state == TS_RUN) &&					\
	    ((t)->t_disp_queue->disp_cpu != NULL) &&			\
	    bitset_in_set(&(pg)->cmt_cpus_actv_set,			\
	    (t)->t_disp_queue->disp_cpu->cpu_seqid))
1370 
1371 static void
1372 cmt_ev_thread_swtch_pwr(pg_t *pg, cpu_t *cp, hrtime_t now, kthread_t *old,
1373     kthread_t *new)
1374 {
1375 	pg_cmt_t	*cmt = (pg_cmt_t *)pg;
1376 	cpupm_domain_t	*dom;
1377 	uint32_t	u;
1378 
1379 	if (old == cp->cpu_idle_thread) {
1380 		ASSERT(new != cp->cpu_idle_thread);
1381 		u = atomic_add_32_nv(&cmt->cmt_utilization, 1);
1382 		if (u == 1) {
1383 			/*
1384 			 * Notify the CPU power manager that the domain
1385 			 * is non-idle.
1386 			 */
1387 			dom = (cpupm_domain_t *)cmt->cmt_pg.pghw_handle;
1388 			cpupm_utilization_event(cp, now, dom,
1389 			    CPUPM_DOM_BUSY_FROM_IDLE);
1390 		}
1391 	} else if (new == cp->cpu_idle_thread) {
1392 		ASSERT(old != cp->cpu_idle_thread);
1393 		u = atomic_add_32_nv(&cmt->cmt_utilization, -1);
1394 		if (u == 0) {
1395 			/*
1396 			 * The domain is idle, notify the CPU power
1397 			 * manager.
1398 			 *
1399 			 * Avoid notifying if the thread is simply migrating
1400 			 * between CPUs in the domain.
1401 			 */
1402 			if (!THREAD_RUNNABLE_IN_PG(old, cmt)) {
1403 				dom = (cpupm_domain_t *)cmt->cmt_pg.pghw_handle;
1404 				cpupm_utilization_event(cp, now, dom,
1405 				    CPUPM_DOM_IDLE_FROM_BUSY);
1406 			}
1407 		}
1408 	}
1409 }
1410 
1411 /* ARGSUSED */
1412 static void
1413 cmt_ev_thread_remain_pwr(pg_t *pg, cpu_t *cp, kthread_t *t)
1414 {
1415 	pg_cmt_t	*cmt = (pg_cmt_t *)pg;
1416 	cpupm_domain_t	*dom;
1417 
1418 	dom = (cpupm_domain_t *)cmt->cmt_pg.pghw_handle;
1419 	cpupm_utilization_event(cp, (hrtime_t)0, dom, CPUPM_DOM_REMAIN_BUSY);
1420 }
1421 
1422 /*
1423  * Return the name of the CMT scheduling policy
1424  * being implemented across this PG
1425  */
1426 static char *
1427 pg_cmt_policy_name(pg_t *pg)
1428 {
1429 	pg_cmt_policy_t policy;
1430 
1431 	policy = ((pg_cmt_t *)pg)->cmt_policy;
1432 
1433 	if (policy & CMT_AFFINITY) {
1434 		if (policy & CMT_BALANCE)
1435 			return ("Load Balancing & Affinity");
1436 		else if (policy & CMT_COALESCE)
1437 			return ("Load Coalescence & Affinity");
1438 		else
1439 			return ("Affinity");
1440 	} else {
1441 		if (policy & CMT_BALANCE)
1442 			return ("Load Balancing");
1443 		else if (policy & CMT_COALESCE)
1444 			return ("Load Coalescence");
1445 		else
1446 			return ("None");
1447 	}
1448 }
1449 
1450 /*
1451  * Prune PG, and all other instances of PG's hardware sharing relationship
1452  * from the CMT PG hierarchy.
1453  *
1454  * This routine operates on the CPU specific processor group data (for the CPUs
1455  * in the PG being pruned), and may be invoked from a context where one CPU's
1456  * PG data is under construction. In this case the argument "pgdata", if not
1457  * NULL, is a reference to the CPU's under-construction PG data.
1458  */
1459 static int
1460 pg_cmt_prune(pg_cmt_t *pg_bad, pg_cmt_t **lineage, int *sz, cpu_pg_t *pgdata)
1461 {
1462 	group_t		*hwset, *children;
1463 	int		i, j, r, size = *sz;
1464 	group_iter_t	hw_iter, child_iter;
1465 	pg_cpu_itr_t	cpu_iter;
1466 	pg_cmt_t	*pg, *child;
1467 	cpu_t		*cpu;
1468 	int		cap_needed;
1469 	pghw_type_t	hw;
1470 
1471 	ASSERT(MUTEX_HELD(&cpu_lock));
1472 
1473 	hw = ((pghw_t *)pg_bad)->pghw_hw;
1474 
1475 	if (hw == PGHW_POW_ACTIVE) {
1476 		cmn_err(CE_NOTE, "!Active CPUPM domain groups look suspect. "
1477 		    "Event Based CPUPM Unavailable");
1478 	} else if (hw == PGHW_POW_IDLE) {
1479 		cmn_err(CE_NOTE, "!Idle CPUPM domain groups look suspect. "
1480 		    "Dispatcher assisted CPUPM disabled.");
1481 	}
1482 
1483 	/*
1484 	 * Find and eliminate the PG from the lineage.
1485 	 */
1486 	for (i = 0; i < size; i++) {
1487 		if (lineage[i] == pg_bad) {
1488 			for (j = i; j < size - 1; j++)
1489 				lineage[j] = lineage[j + 1];
1490 			*sz = size - 1;
1491 			break;
1492 		}
1493 	}
1494 
1495 	/*
1496 	 * We'll prune all instances of the hardware sharing relationship
1497 	 * represented by pg. But before we do that (and pause CPUs) we need
1498 	 * to ensure the hierarchy's groups are properly sized.
1499 	 */
1500 	hwset = pghw_set_lookup(hw);
1501 
1502 	/*
1503 	 * Blacklist the hardware so future processor groups of this type won't
1504 	 * participate in CMT thread placement.
1505 	 *
1506 	 * XXX
1507 	 * For heterogeneous system configurations, this might be overkill.
1508 	 * We may only need to blacklist the illegal PGs, and other instances
1509 	 * of this hardware sharing relationship may be ok.
1510 	 */
1511 	cmt_hw_blacklisted[hw] = 1;
1512 
1513 	/*
1514 	 * For each of the PGs being pruned, ensure sufficient capacity in
1515 	 * the siblings set for the PG's children
1516 	 */
1517 	group_iter_init(&hw_iter);
1518 	while ((pg = group_iterate(hwset, &hw_iter)) != NULL) {
1519 		/*
1520 		 * PG is being pruned, but if it is bringing up more than
1521 		 * one child, ask for more capacity in the siblings group.
1522 		 */
1523 		cap_needed = 0;
1524 		if (pg->cmt_children &&
1525 		    GROUP_SIZE(pg->cmt_children) > 1) {
1526 			cap_needed = GROUP_SIZE(pg->cmt_children) - 1;
1527 
1528 			group_expand(pg->cmt_siblings,
1529 			    GROUP_SIZE(pg->cmt_siblings) + cap_needed);
1530 
1531 			/*
1532 			 * If this is a top level group, also ensure the
1533 			 * capacity in the root lgrp level CMT grouping.
1534 			 */
1535 			if (pg->cmt_parent == NULL &&
1536 			    pg->cmt_siblings != &cmt_root->cl_pgs) {
1537 				group_expand(&cmt_root->cl_pgs,
1538 				    GROUP_SIZE(&cmt_root->cl_pgs) + cap_needed);
1539 				cmt_root->cl_npgs += cap_needed;
1540 			}
1541 		}
1542 	}
1543 
1544 	/*
1545 	 * We're operating on the PG hierarchy. Pause CPUs to ensure
1546 	 * exclusivity with respect to the dispatcher.
1547 	 */
1548 	pause_cpus(NULL);
1549 
1550 	/*
1551 	 * Prune all PG instances of the hardware sharing relationship
1552 	 * represented by pg.
1553 	 */
1554 	group_iter_init(&hw_iter);
1555 	while ((pg = group_iterate(hwset, &hw_iter)) != NULL) {
1556 
1557 		/*
1558 		 * Remove PG from it's group of siblings, if it's there.
1559 		 */
1560 		if (pg->cmt_siblings) {
1561 			(void) group_remove(pg->cmt_siblings, pg, GRP_NORESIZE);
1562 		}
1563 		if (pg->cmt_parent == NULL &&
1564 		    pg->cmt_siblings != &cmt_root->cl_pgs) {
1565 			(void) group_remove(&cmt_root->cl_pgs, pg,
1566 			    GRP_NORESIZE);
1567 		}
1568 
1569 		/*
1570 		 * Indicate that no CMT policy will be implemented across
1571 		 * this PG.
1572 		 */
1573 		pg->cmt_policy = CMT_NO_POLICY;
1574 
1575 		/*
1576 		 * Move PG's children from it's children set to it's parent's
1577 		 * children set. Note that the parent's children set, and PG's
1578 		 * siblings set are the same thing.
1579 		 *
1580 		 * Because we are iterating over the same group that we are
1581 		 * operating on (removing the children), first add all of PG's
1582 		 * children to the parent's children set, and once we are done
1583 		 * iterating, empty PG's children set.
1584 		 */
1585 		if (pg->cmt_children != NULL) {
1586 			children = pg->cmt_children;
1587 
1588 			group_iter_init(&child_iter);
1589 			while ((child = group_iterate(children, &child_iter))
1590 			    != NULL) {
1591 				if (pg->cmt_siblings != NULL) {
1592 					r = group_add(pg->cmt_siblings, child,
1593 					    GRP_NORESIZE);
1594 					ASSERT(r == 0);
1595 
1596 					if (pg->cmt_parent == NULL &&
1597 					    pg->cmt_siblings !=
1598 					    &cmt_root->cl_pgs) {
1599 						r = group_add(&cmt_root->cl_pgs,
1600 						    child, GRP_NORESIZE);
1601 						ASSERT(r == 0);
1602 					}
1603 				}
1604 			}
1605 			group_empty(pg->cmt_children);
1606 		}
1607 
1608 		/*
1609 		 * Reset the callbacks to the defaults
1610 		 */
1611 		pg_callback_set_defaults((pg_t *)pg);
1612 
1613 		/*
1614 		 * Update all the CPU lineages in each of PG's CPUs
1615 		 */
1616 		PG_CPU_ITR_INIT(pg, cpu_iter);
1617 		while ((cpu = pg_cpu_next(&cpu_iter)) != NULL) {
1618 			pg_cmt_t	*cpu_pg;
1619 			group_iter_t	liter;	/* Iterator for the lineage */
1620 			cpu_pg_t	*cpd;	/* CPU's PG data */
1621 
1622 			/*
1623 			 * The CPU's lineage is under construction still
1624 			 * references the bootstrap CPU PG data structure.
1625 			 */
1626 			if (pg_cpu_is_bootstrapped(cpu))
1627 				cpd = pgdata;
1628 			else
1629 				cpd = cpu->cpu_pg;
1630 
1631 			/*
1632 			 * Iterate over the CPU's PGs updating the children
1633 			 * of the PG being promoted, since they have a new
1634 			 * parent and siblings set.
1635 			 */
1636 			group_iter_init(&liter);
1637 			while ((cpu_pg = group_iterate(&cpd->pgs,
1638 			    &liter)) != NULL) {
1639 				if (cpu_pg->cmt_parent == pg) {
1640 					cpu_pg->cmt_parent = pg->cmt_parent;
1641 					cpu_pg->cmt_siblings = pg->cmt_siblings;
1642 				}
1643 			}
1644 
1645 			/*
1646 			 * Update the CPU's lineages
1647 			 *
1648 			 * Remove the PG from the CPU's group used for CMT
1649 			 * scheduling.
1650 			 */
1651 			(void) group_remove(&cpd->cmt_pgs, pg, GRP_NORESIZE);
1652 		}
1653 	}
1654 	start_cpus();
1655 	return (0);
1656 }
1657 
1658 /*
1659  * Disable CMT scheduling
1660  */
1661 static void
1662 pg_cmt_disable(void)
1663 {
1664 	cpu_t		*cpu;
1665 
1666 	ASSERT(MUTEX_HELD(&cpu_lock));
1667 
1668 	pause_cpus(NULL);
1669 	cpu = cpu_list;
1670 
1671 	do {
1672 		if (cpu->cpu_pg)
1673 			group_empty(&cpu->cpu_pg->cmt_pgs);
1674 	} while ((cpu = cpu->cpu_next) != cpu_list);
1675 
1676 	cmt_sched_disabled = 1;
1677 	start_cpus();
1678 	cmn_err(CE_NOTE, "!CMT thread placement optimizations unavailable");
1679 }
1680 
1681 /*
1682  * CMT lineage validation
1683  *
1684  * This routine is invoked by pg_cmt_cpu_init() to validate the integrity
1685  * of the PGs in a CPU's lineage. This is necessary because it's possible that
1686  * some groupings (power domain groupings in particular) may be defined by
1687  * sources that are buggy (e.g. BIOS bugs). In such cases, it may not be
1688  * possible to integrate those groupings into the CMT PG hierarchy, if doing
1689  * so would violate the subset invariant of the hierarchy, which says that
1690  * a PG must be subset of its parent (if it has one).
1691  *
1692  * pg_cmt_lineage_validate()'s purpose is to detect grouping definitions that
1693  * would result in a violation of this invariant. If a violation is found,
1694  * and the PG is of a grouping type who's definition is known to originate from
1695  * suspect sources (BIOS), then pg_cmt_prune() will be invoked to prune the
1696  * PG (and all other instances PG's sharing relationship type) from the CMT
1697  * hierarchy. Further, future instances of that sharing relationship type won't
1698  * be added. If the grouping definition doesn't originate from suspect
1699  * sources, then pg_cmt_disable() will be invoked to log an error, and disable
1700  * CMT scheduling altogether.
1701  *
1702  * This routine is invoked after the CPU has been added to the PGs in which
1703  * it belongs, but before those PGs have been added to (or had their place
1704  * adjusted in) the CMT PG hierarchy.
1705  *
1706  * The first argument is the CPUs PG lineage (essentially an array of PGs in
1707  * which the CPU belongs) that has already been sorted in ascending order
1708  * by CPU count. Some of the PGs in the CPUs lineage may already have other
1709  * CPUs in them, and have already been integrated into the CMT hierarchy.
1710  *
1711  * The addition of this new CPU to these pre-existing PGs means that those
1712  * PGs may need to be promoted up in the hierarchy to satisfy the subset
1713  * invariant. In additon to testing the subset invariant for the lineage,
1714  * this routine also verifies that the addition of the new CPU to the
1715  * existing PGs wouldn't cause the subset invariant to be violated in
1716  * the exiting lineages.
1717  *
1718  * This routine will normally return one of the following:
1719  * CMT_LINEAGE_VALID - There were no problems detected with the lineage.
1720  * CMT_LINEAGE_REPAIRED - Problems were detected, but repaired via pruning.
1721  *
1722  * Otherwise, this routine will return a value indicating which error it
1723  * was unable to recover from (and set cmt_lineage_status along the way).
1724  *
1725  * This routine operates on the CPU specific processor group data (for the CPU
1726  * whose lineage is being validated), which is under-construction.
1727  * "pgdata" is a reference to the CPU's under-construction PG data.
1728  * This routine must be careful to operate only on "pgdata", and not cp->cpu_pg.
1729  */
1730 static cmt_lineage_validation_t
1731 pg_cmt_lineage_validate(pg_cmt_t **lineage, int *sz, cpu_pg_t *pgdata)
1732 {
1733 	int		i, j, size;
1734 	pg_cmt_t	*pg, *pg_next, *pg_bad, *pg_tmp, *parent;
1735 	cpu_t		*cp;
1736 	pg_cpu_itr_t	cpu_iter;
1737 	lgrp_handle_t	lgrp;
1738 
1739 	ASSERT(MUTEX_HELD(&cpu_lock));
1740 
1741 revalidate:
1742 	size = *sz;
1743 	pg_bad = NULL;
1744 	lgrp = LGRP_NULL_HANDLE;
1745 	for (i = 0; i < size; i++) {
1746 
1747 		pg = lineage[i];
1748 		if (i < size - 1)
1749 			pg_next = lineage[i + 1];
1750 		else
1751 			pg_next = NULL;
1752 
1753 		/*
1754 		 * We assume that the lineage has already been sorted
1755 		 * by the number of CPUs. In fact, we depend on it.
1756 		 */
1757 		ASSERT(pg_next == NULL ||
1758 		    (PG_NUM_CPUS((pg_t *)pg) <= PG_NUM_CPUS((pg_t *)pg_next)));
1759 
1760 		/*
1761 		 * The CPUs PG lineage was passed as the first argument to
1762 		 * this routine and contains the sorted list of the CPU's
1763 		 * PGs. Ultimately, the ordering of the PGs in that list, and
1764 		 * the ordering as traversed by the cmt_parent list must be
1765 		 * the same. PG promotion will be used as the mechanism to
1766 		 * achieve this, but first we need to look for cases where
1767 		 * promotion will be necessary, and validate that will be
1768 		 * possible without violating the subset invarient described
1769 		 * above.
1770 		 *
1771 		 * Since the PG topology is in the middle of being changed, we
1772 		 * need to check whether the PG's existing parent (if any) is
1773 		 * part of this CPU's lineage (and therefore should contain
1774 		 * the new CPU). If not, it means that the addition of the
1775 		 * new CPU should have made this PG have more CPUs than its
1776 		 * parent (and other ancestors not in the same lineage) and
1777 		 * will need to be promoted into place.
1778 		 *
1779 		 * We need to verify all of this to defend against a buggy
1780 		 * BIOS giving bad power domain CPU groupings. Sigh.
1781 		 */
1782 		parent = pg->cmt_parent;
1783 		while (parent != NULL) {
1784 			/*
1785 			 * Determine if the parent/ancestor is in this lineage
1786 			 */
1787 			pg_tmp = NULL;
1788 			for (j = 0; (j < size) && (pg_tmp != parent); j++) {
1789 				pg_tmp = lineage[j];
1790 			}
1791 			if (pg_tmp == parent) {
1792 				/*
1793 				 * It's in the lineage. The concentricity
1794 				 * checks will handle the rest.
1795 				 */
1796 				break;
1797 			}
1798 			/*
1799 			 * If it is not in the lineage, PG will eventually
1800 			 * need to be promoted above it. Verify the ancestor
1801 			 * is a proper subset. There is still an error if
1802 			 * the ancestor has the same number of CPUs as PG,
1803 			 * since that would imply it should be in the lineage,
1804 			 * and we already know it isn't.
1805 			 */
1806 			if (PG_NUM_CPUS((pg_t *)parent) >=
1807 			    PG_NUM_CPUS((pg_t *)pg)) {
1808 				/*
1809 				 * Not a proper subset if the parent/ancestor
1810 				 * has the same or more CPUs than PG.
1811 				 */
1812 				cmt_lineage_status = CMT_LINEAGE_NON_PROMOTABLE;
1813 				goto handle_error;
1814 			}
1815 			parent = parent->cmt_parent;
1816 		}
1817 
1818 		/*
1819 		 * Walk each of the CPUs in the PGs group and perform
1820 		 * consistency checks along the way.
1821 		 */
1822 		PG_CPU_ITR_INIT((pg_t *)pg, cpu_iter);
1823 		while ((cp = pg_cpu_next(&cpu_iter)) != NULL) {
1824 			/*
1825 			 * Verify that there aren't any CPUs contained in PG
1826 			 * that the next PG in the lineage (which is larger
1827 			 * or same size) doesn't also contain.
1828 			 */
1829 			if (pg_next != NULL &&
1830 			    pg_cpu_find((pg_t *)pg_next, cp) == B_FALSE) {
1831 				cmt_lineage_status = CMT_LINEAGE_NON_CONCENTRIC;
1832 				goto handle_error;
1833 			}
1834 
1835 			/*
1836 			 * Verify that all the CPUs in the PG are in the same
1837 			 * lgroup.
1838 			 */
1839 			if (lgrp == LGRP_NULL_HANDLE) {
1840 				lgrp = lgrp_plat_cpu_to_hand(cp->cpu_id);
1841 			} else if (lgrp_plat_cpu_to_hand(cp->cpu_id) != lgrp) {
1842 				cmt_lineage_status = CMT_LINEAGE_PG_SPANS_LGRPS;
1843 				goto handle_error;
1844 			}
1845 		}
1846 	}
1847 
1848 handle_error:
1849 	/*
1850 	 * Some of these validation errors can result when the CPU grouping
1851 	 * information is derived from buggy sources (for example, incorrect
1852 	 * ACPI tables on x86 systems).
1853 	 *
1854 	 * We'll try to recover in such cases by pruning out the illegal
1855 	 * groupings from the PG hierarchy, which means that we won't optimize
1856 	 * for those levels, but we will for the remaining ones.
1857 	 */
1858 	switch (cmt_lineage_status) {
1859 	case CMT_LINEAGE_VALID:
1860 	case CMT_LINEAGE_REPAIRED:
1861 		break;
1862 	case CMT_LINEAGE_PG_SPANS_LGRPS:
1863 		/*
1864 		 * We've detected a PG whose CPUs span lgroups.
1865 		 *
1866 		 * This isn't supported, as the dispatcher isn't allowed to
1867 		 * to do CMT thread placement across lgroups, as this would
1868 		 * conflict with policies implementing MPO thread affinity.
1869 		 *
1870 		 * If the PG is of a sharing relationship type known to
1871 		 * legitimately span lgroups, specify that no CMT thread
1872 		 * placement policy should be implemented, and prune the PG
1873 		 * from the existing CMT PG hierarchy.
1874 		 *
1875 		 * Otherwise, fall though to the case below for handling.
1876 		 */
1877 		if (((pghw_t *)pg)->pghw_hw == PGHW_CHIP) {
1878 			if (pg_cmt_prune(pg, lineage, sz, pgdata) == 0) {
1879 				cmt_lineage_status = CMT_LINEAGE_REPAIRED;
1880 				goto revalidate;
1881 			}
1882 		}
1883 		/*LINTED*/
1884 	case CMT_LINEAGE_NON_PROMOTABLE:
1885 		/*
1886 		 * We've detected a PG that already exists in another CPU's
1887 		 * lineage that cannot cannot legally be promoted into place
1888 		 * without breaking the invariants of the hierarchy.
1889 		 */
1890 		if (PG_CMT_HW_SUSPECT(((pghw_t *)pg)->pghw_hw)) {
1891 			if (pg_cmt_prune(pg, lineage, sz, pgdata) == 0) {
1892 				cmt_lineage_status = CMT_LINEAGE_REPAIRED;
1893 				goto revalidate;
1894 			}
1895 		}
1896 		/*
1897 		 * Something went wrong trying to prune out the bad level.
1898 		 * Disable CMT scheduling altogether.
1899 		 */
1900 		pg_cmt_disable();
1901 		break;
1902 	case CMT_LINEAGE_NON_CONCENTRIC:
1903 		/*
1904 		 * We've detected a non-concentric PG lineage, which means that
1905 		 * there's a PG in the lineage that has CPUs that the next PG
1906 		 * over in the lineage (which is the same size or larger)
1907 		 * doesn't have.
1908 		 *
1909 		 * In this case, we examine the two PGs to see if either
1910 		 * grouping is defined by potentially buggy sources.
1911 		 *
1912 		 * If one has less CPUs than the other, and contains CPUs
1913 		 * not found in the parent, and it is an untrusted enumeration,
1914 		 * then prune it. If both have the same number of CPUs, then
1915 		 * prune the one that is untrusted.
1916 		 *
1917 		 * This process repeats until we have a concentric lineage,
1918 		 * or we would have to prune out level derived from what we
1919 		 * thought was a reliable source, in which case CMT scheduling
1920 		 * is disabled altogether.
1921 		 */
1922 		if ((PG_NUM_CPUS((pg_t *)pg) < PG_NUM_CPUS((pg_t *)pg_next)) &&
1923 		    (PG_CMT_HW_SUSPECT(((pghw_t *)pg)->pghw_hw))) {
1924 			pg_bad = pg;
1925 		} else if (PG_NUM_CPUS((pg_t *)pg) ==
1926 		    PG_NUM_CPUS((pg_t *)pg_next)) {
1927 			if (PG_CMT_HW_SUSPECT(((pghw_t *)pg_next)->pghw_hw)) {
1928 				pg_bad = pg_next;
1929 			} else if (PG_CMT_HW_SUSPECT(((pghw_t *)pg)->pghw_hw)) {
1930 				pg_bad = pg;
1931 			}
1932 		}
1933 		if (pg_bad) {
1934 			if (pg_cmt_prune(pg_bad, lineage, sz, pgdata) == 0) {
1935 				cmt_lineage_status = CMT_LINEAGE_REPAIRED;
1936 				goto revalidate;
1937 			}
1938 		}
1939 		/*
1940 		 * Something went wrong trying to identify and/or prune out
1941 		 * the bad level. Disable CMT scheduling altogether.
1942 		 */
1943 		pg_cmt_disable();
1944 		break;
1945 	default:
1946 		/*
1947 		 * If we're here, we've encountered a validation error for
1948 		 * which we don't know how to recover. In this case, disable
1949 		 * CMT scheduling altogether.
1950 		 */
1951 		cmt_lineage_status = CMT_LINEAGE_UNRECOVERABLE;
1952 		pg_cmt_disable();
1953 	}
1954 	return (cmt_lineage_status);
1955 }
1956