xref: /titanic_50/usr/src/uts/common/disp/cmt.c (revision 52244c0958bdf281ca42932b449f644b4decfdc2)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24 
25 #include <sys/systm.h>
26 #include <sys/types.h>
27 #include <sys/param.h>
28 #include <sys/thread.h>
29 #include <sys/cpuvar.h>
30 #include <sys/cpupart.h>
31 #include <sys/kmem.h>
32 #include <sys/cmn_err.h>
33 #include <sys/kstat.h>
34 #include <sys/processor.h>
35 #include <sys/disp.h>
36 #include <sys/group.h>
37 #include <sys/pghw.h>
38 #include <sys/bitset.h>
39 #include <sys/lgrp.h>
40 #include <sys/cmt.h>
41 #include <sys/cpu_pm.h>
42 
43 /*
44  * CMT scheduler / dispatcher support
45  *
46  * This file implements CMT scheduler support using Processor Groups.
47  * The CMT processor group class creates and maintains the CMT class
48  * specific processor group pg_cmt_t.
49  *
50  * ---------------------------- <-- pg_cmt_t *
51  * | pghw_t                   |
52  * ----------------------------
53  * | CMT class specific data  |
54  * | - hierarchy linkage      |
55  * | - CMT load balancing data|
56  * | - active CPU group/bitset|
57  * ----------------------------
58  *
59  * The scheduler/dispatcher leverages knowledge of the performance
60  * relevant CMT sharing relationships existing between cpus to implement
61  * optimized affinity, load balancing, and coalescence policies.
62  *
63  * Load balancing policy seeks to improve performance by minimizing
64  * contention over shared processor resources / facilities. Affinity
65  * policies seek to improve cache and TLB utilization. Coalescence
66  * policies improve resource utilization and ultimately power efficiency.
67  *
68  * The CMT PGs created by this class are already arranged into a
69  * hierarchy (which is done in the pghw layer). To implement the top-down
70  * CMT load balancing algorithm, the CMT PGs additionally maintain
71  * parent, child and sibling hierarchy relationships.
72  * Parent PGs always contain a superset of their children's resources,
73  * each PG can have at most one parent, and siblings are the group of PGs
74  * sharing the same parent.
75  *
76  * On UMA based systems, the CMT load balancing algorithm begins by balancing
77  * load across the group of top level PGs in the system hierarchy.
78  * On NUMA systems, the CMT load balancing algorithm balances load across the
79  * group of top level PGs in each leaf lgroup...but for root homed threads,
80  * is willing to balance against all the top level PGs in the system.
81  *
82  * Groups of top level PGs are maintained to implement the above, one for each
83  * leaf lgroup (containing the top level PGs in that lgroup), and one (for the
84  * root lgroup) that contains all the top level PGs in the system.
85  */
/*
 * Bookkeeping for the groups of top level CMT PGs kept per lgroup.
 */
86 static cmt_lgrp_t	*cmt_lgrps = NULL;	/* cmt_lgrps list head */
/*
 * cpu0_lgrp is recorded by pg_cmt_cpu_init() the first time it runs to
 * completion, and is what pg_cmt_cpu_fini() tears down when the only CPU in
 * the system is found to have changed lgrp affiliation.
 */
87 static cmt_lgrp_t	*cpu0_lgrp = NULL;	/* boot CPU's initial lgrp */
88 						/* used for null_proc_lpa */
/*
 * cmt_root is created on demand by pg_cmt_cpu_init() from
 * lgrp_plat_root_hand(); its cl_pgs group holds the top level PGs that are
 * also tracked for the root lgroup.
 */
89 cmt_lgrp_t		*cmt_root = NULL;	/* cmt_lgrp_t for root lgroup */
90 
/* Cleared at the end of the first complete pg_cmt_cpu_init() call. */
91 static int		is_cpu0 = 1; /* true if this is boot CPU context */
92 
93 /*
94  * Array of hardware sharing relationships that are blacklisted.
95  * CMT scheduling optimizations won't be performed for blacklisted sharing
96  * relationships.
97  */
98 static int		cmt_hw_blacklisted[PGHW_NUM_COMPONENTS];
99 
100 /*
101  * Set this to non-zero to disable CMT scheduling.
102  * This must be done via kmdb -d, as /etc/system will be too late.
103  */
104 int			cmt_sched_disabled = 0;
105 
106 /*
107  * Status codes for CMT lineage validation
108  * See pg_cmt_lineage_validate() below
 *
 * pg_cmt_cpu_init() goes on to build the CPU's hierarchy only when the
 * status is CMT_LINEAGE_VALID or CMT_LINEAGE_REPAIRED; any other status
 * makes it return without linking the CPU's PGs into the hierarchy.
109  */
110 typedef enum cmt_lineage_validation {
111 	CMT_LINEAGE_VALID,		/* accepted by pg_cmt_cpu_init() */
112 	CMT_LINEAGE_NON_CONCENTRIC,
113 	CMT_LINEAGE_PG_SPANS_LGRPS,
114 	CMT_LINEAGE_NON_PROMOTABLE,
115 	CMT_LINEAGE_REPAIRED,		/* accepted by pg_cmt_cpu_init() */
116 	CMT_LINEAGE_UNRECOVERABLE
117 } cmt_lineage_validation_t;
118 
119 /*
120  * Status of the current lineage under construction.
121  * One must be holding cpu_lock to change this.
122  */
123 cmt_lineage_validation_t	cmt_lineage_status = CMT_LINEAGE_VALID;
124 
125 /*
126  * Power domain definitions (on x86) are defined by ACPI, and
127  * therefore may be subject to BIOS bugs.
128  */
129 #define	PG_CMT_HW_SUSPECT(hw)	PGHW_IS_PM_DOMAIN(hw)
130 
131 /*
132  * Macro to test if PG is managed by the CMT PG class
133  */
134 #define	IS_CMT_PG(pg)	(((pg_t *)(pg))->pg_class->pgc_id == pg_cmt_class_id)
135 
136 static pg_cid_t		pg_cmt_class_id;		/* PG class id */
137 
138 static pg_t		*pg_cmt_alloc();
139 static void		pg_cmt_free(pg_t *);
140 static void		pg_cmt_cpu_init(cpu_t *, cpu_pg_t *);
141 static void		pg_cmt_cpu_fini(cpu_t *, cpu_pg_t *);
142 static void		pg_cmt_cpu_active(cpu_t *);
143 static void		pg_cmt_cpu_inactive(cpu_t *);
144 static void		pg_cmt_cpupart_in(cpu_t *, cpupart_t *);
145 static void		pg_cmt_cpupart_move(cpu_t *, cpupart_t *, cpupart_t *);
146 static char		*pg_cmt_policy_name(pg_t *);
147 static void		pg_cmt_hier_sort(pg_cmt_t **, int);
148 static pg_cmt_t		*pg_cmt_hier_rank(pg_cmt_t *, pg_cmt_t *);
149 static int		pg_cmt_cpu_belongs(pg_t *, cpu_t *);
150 static int		pg_cmt_hw(pghw_type_t);
151 static cmt_lgrp_t	*pg_cmt_find_lgrp(lgrp_handle_t);
152 static cmt_lgrp_t	*pg_cmt_lgrp_create(lgrp_handle_t);
153 static void		cmt_ev_thread_swtch(pg_t *, cpu_t *, hrtime_t,
154 			    kthread_t *, kthread_t *);
155 static void		cmt_ev_thread_swtch_pwr(pg_t *, cpu_t *, hrtime_t,
156 			    kthread_t *, kthread_t *);
157 static void		cmt_ev_thread_remain_pwr(pg_t *, cpu_t *, kthread_t *);
158 static cmt_lineage_validation_t	pg_cmt_lineage_validate(pg_cmt_t **, int *,
159 			    cpu_pg_t *);
160 
161 /*
162  * CMT PG ops
 *
 * Operations vector passed to pg_class_register() by pg_cmt_class_init().
 * The initializer is positional; the note beside each entry summarizes the
 * callback as it is described at its definition in this file.
163  */
164 struct pg_ops pg_ops_cmt = {
165 	pg_cmt_alloc,		/* allocate a zeroed pg_cmt_t */
166 	pg_cmt_free,		/* free a pg_cmt_t */
167 	pg_cmt_cpu_init,	/* new CPU entering the system */
168 	pg_cmt_cpu_fini,	/* CPU leaving the system */
169 	pg_cmt_cpu_active,	/* CPU becomes active (online) */
170 	pg_cmt_cpu_inactive,	/* CPU goes inactive (offline) */
171 	pg_cmt_cpupart_in,	/* CPU entering a cpu partition */
172 	NULL,			/* cpupart_out */
173 	pg_cmt_cpupart_move,	/* CPU moving between partitions */
174 	pg_cmt_cpu_belongs,	/* does the CPU belong in the PG? */
175 	pg_cmt_policy_name,	/* returns a char * for a PG; body not shown */
176 };
177 
178 /*
179  * Initialize the CMT PG class
180  */
181 void
182 pg_cmt_class_init(void)
183 {
184 	if (cmt_sched_disabled)
185 		return;
186 
187 	pg_cmt_class_id = pg_class_register("cmt", &pg_ops_cmt, PGR_PHYSICAL);
188 }
189 
190 /*
191  * Called to indicate a new CPU has started up so
192  * that either t0 or the slave startup thread can
193  * be accounted for.
194  */
195 void
196 pg_cmt_cpu_startup(cpu_t *cp)
197 {
198 	pg_ev_thread_swtch(cp, gethrtime_unscaled(), cp->cpu_idle_thread,
199 	    cp->cpu_thread);
200 }
201 
202 /*
203  * Return non-zero if thread can migrate between "from" and "to"
204  * without a performance penalty
205  */
206 int
207 pg_cmt_can_migrate(cpu_t *from, cpu_t *to)
208 {
209 	if (from->cpu_physid->cpu_cacheid ==
210 	    to->cpu_physid->cpu_cacheid)
211 		return (1);
212 	return (0);
213 }
214 
215 /*
216  * CMT class specific PG allocation
217  */
218 static pg_t *
219 pg_cmt_alloc(void)
220 {
221 	return (kmem_zalloc(sizeof (pg_cmt_t), KM_NOSLEEP));
222 }
223 
224 /*
225  * Class specific PG de-allocation
226  */
227 static void
228 pg_cmt_free(pg_t *pg)
229 {
230 	ASSERT(pg != NULL);
231 	ASSERT(IS_CMT_PG(pg));
232 
233 	kmem_free((pg_cmt_t *)pg, sizeof (pg_cmt_t));
234 }
235 
236 /*
237  * Given a hardware sharing relationship, return which dispatcher
238  * policies should be implemented to optimize performance and efficiency
239  */
240 static pg_cmt_policy_t
241 pg_cmt_policy(pghw_type_t hw)
242 {
243 	pg_cmt_policy_t p;
244 
245 	/*
246 	 * Give the platform a chance to override the default
247 	 */
248 	if ((p = pg_plat_cmt_policy(hw)) != CMT_NO_POLICY)
249 		return (p);
250 
251 	switch (hw) {
252 	case PGHW_IPIPE:
253 	case PGHW_FPU:
254 	case PGHW_PROCNODE:
255 	case PGHW_CHIP:
256 		return (CMT_BALANCE);
257 	case PGHW_CACHE:
258 		return (CMT_AFFINITY | CMT_BALANCE);
259 	case PGHW_POW_ACTIVE:
260 	case PGHW_POW_IDLE:
261 		return (CMT_BALANCE);
262 	default:
263 		return (CMT_NO_POLICY);
264 	}
265 }
266 
267 /*
268  * Rank the importance of optimizing for the pg1 relationship vs.
269  * the pg2 relationship.
270  */
271 static pg_cmt_t *
272 pg_cmt_hier_rank(pg_cmt_t *pg1, pg_cmt_t *pg2)
273 {
274 	pghw_type_t hw1 = ((pghw_t *)pg1)->pghw_hw;
275 	pghw_type_t hw2 = ((pghw_t *)pg2)->pghw_hw;
276 
277 	/*
278 	 * A power domain is only important if CPUPM is enabled.
279 	 */
280 	if (cpupm_get_policy() == CPUPM_POLICY_DISABLED) {
281 		if (PGHW_IS_PM_DOMAIN(hw1) && !PGHW_IS_PM_DOMAIN(hw2))
282 			return (pg2);
283 		if (PGHW_IS_PM_DOMAIN(hw2) && !PGHW_IS_PM_DOMAIN(hw1))
284 			return (pg1);
285 	}
286 
287 	/*
288 	 * Otherwise, ask the platform
289 	 */
290 	if (pg_plat_hw_rank(hw1, hw2) == hw1)
291 		return (pg1);
292 	else
293 		return (pg2);
294 }
295 
296 /*
297  * Initialize CMT callbacks for the given PG
298  */
299 static void
300 cmt_callback_init(pg_t *pg)
301 {
302 	/*
303 	 * Stick with the default callbacks if there isn't going to be
304 	 * any CMT thread placement optimizations implemented.
305 	 */
306 	if (((pg_cmt_t *)pg)->cmt_policy == CMT_NO_POLICY)
307 		return;
308 
309 	switch (((pghw_t *)pg)->pghw_hw) {
310 	case PGHW_POW_ACTIVE:
311 		pg->pg_cb.thread_swtch = cmt_ev_thread_swtch_pwr;
312 		pg->pg_cb.thread_remain = cmt_ev_thread_remain_pwr;
313 		break;
314 	default:
315 		pg->pg_cb.thread_swtch = cmt_ev_thread_swtch;
316 
317 	}
318 }
319 
320 /*
321  * Promote PG above it's current parent.
322  * This is only legal if PG has an equal or greater number of CPUs than its
323  * parent.
324  *
325  * This routine operates on the CPU specific processor group data (for the CPUs
326  * in the PG being promoted), and may be invoked from a context where one CPU's
327  * PG data is under construction. In this case the argument "pgdata", if not
328  * NULL, is a reference to the CPU's under-construction PG data.
329  */
330 static void
331 cmt_hier_promote(pg_cmt_t *pg, cpu_pg_t *pgdata)
332 {
333 	pg_cmt_t	*parent;
334 	group_t		*children;
335 	cpu_t		*cpu;
336 	group_iter_t	iter;
337 	pg_cpu_itr_t	cpu_iter;
338 	int		r;
339 	int		err;
340 	int		nchildren;
341 
342 	ASSERT(MUTEX_HELD(&cpu_lock));
343 
344 	parent = pg->cmt_parent;
345 	if (parent == NULL) {
346 		/*
347 		 * Nothing to do
348 		 */
349 		return;
350 	}
351 
352 	ASSERT(PG_NUM_CPUS((pg_t *)pg) >= PG_NUM_CPUS((pg_t *)parent));
353 
354 	/*
355 	 * We're changing around the hierarchy, which is actively traversed
356 	 * by the dispatcher. Pause CPUS to ensure exclusivity.
357 	 */
358 	pause_cpus(NULL, NULL);
359 
360 	/*
361 	 * If necessary, update the parent's sibling set, replacing parent
362 	 * with PG.
363 	 */
364 	if (parent->cmt_siblings) {
365 		if (group_remove(parent->cmt_siblings, parent, GRP_NORESIZE)
366 		    != -1) {
367 			r = group_add(parent->cmt_siblings, pg, GRP_NORESIZE);
368 			ASSERT(r != -1);
369 		}
370 	}
371 
372 	/*
373 	 * If the parent is at the top of the hierarchy, replace it's entry
374 	 * in the root lgroup's group of top level PGs.
375 	 */
376 	if (parent->cmt_parent == NULL &&
377 	    parent->cmt_siblings != &cmt_root->cl_pgs) {
378 		if (group_remove(&cmt_root->cl_pgs, parent, GRP_NORESIZE)
379 		    != -1) {
380 			r = group_add(&cmt_root->cl_pgs, pg, GRP_NORESIZE);
381 			ASSERT(r != -1);
382 		}
383 	}
384 
385 	/*
386 	 * We assume (and therefore assert) that the PG being promoted is an
387 	 * only child of it's parent. Update the parent's children set
388 	 * replacing PG's entry with the parent (since the parent is becoming
389 	 * the child). Then have PG and the parent swap children sets and
390 	 * children counts.
391 	 */
392 	ASSERT(GROUP_SIZE(parent->cmt_children) <= 1);
393 	if (group_remove(parent->cmt_children, pg, GRP_NORESIZE) != -1) {
394 		r = group_add(parent->cmt_children, parent, GRP_NORESIZE);
395 		ASSERT(r != -1);
396 	}
397 
398 	children = pg->cmt_children;
399 	pg->cmt_children = parent->cmt_children;
400 	parent->cmt_children = children;
401 
402 	nchildren = pg->cmt_nchildren;
403 	pg->cmt_nchildren = parent->cmt_nchildren;
404 	parent->cmt_nchildren = nchildren;
405 
406 	/*
407 	 * Update the sibling references for PG and it's parent
408 	 */
409 	pg->cmt_siblings = parent->cmt_siblings;
410 	parent->cmt_siblings = pg->cmt_children;
411 
412 	/*
413 	 * Update any cached lineages in the per CPU pg data.
414 	 */
415 	PG_CPU_ITR_INIT(pg, cpu_iter);
416 	while ((cpu = pg_cpu_next(&cpu_iter)) != NULL) {
417 		int		idx;
418 		int		sz;
419 		pg_cmt_t	*cpu_pg;
420 		cpu_pg_t	*pgd;	/* CPU's PG data */
421 
422 		/*
423 		 * The CPU's whose lineage is under construction still
424 		 * references the bootstrap CPU PG data structure.
425 		 */
426 		if (pg_cpu_is_bootstrapped(cpu))
427 			pgd = pgdata;
428 		else
429 			pgd = cpu->cpu_pg;
430 
431 		/*
432 		 * Iterate over the CPU's PGs updating the children
433 		 * of the PG being promoted, since they have a new parent.
434 		 */
435 		group_iter_init(&iter);
436 		while ((cpu_pg = group_iterate(&pgd->cmt_pgs, &iter)) != NULL) {
437 			if (cpu_pg->cmt_parent == pg) {
438 				cpu_pg->cmt_parent = parent;
439 			}
440 		}
441 
442 		/*
443 		 * Update the CMT load balancing lineage
444 		 */
445 		if ((idx = group_find(&pgd->cmt_pgs, (void *)pg)) == -1) {
446 			/*
447 			 * Unless this is the CPU who's lineage is being
448 			 * constructed, the PG being promoted should be
449 			 * in the lineage.
450 			 */
451 			ASSERT(pg_cpu_is_bootstrapped(cpu));
452 			continue;
453 		}
454 
455 		ASSERT(idx > 0);
456 		ASSERT(GROUP_ACCESS(&pgd->cmt_pgs, idx - 1) == parent);
457 
458 		/*
459 		 * Have the child and the parent swap places in the CPU's
460 		 * lineage
461 		 */
462 		group_remove_at(&pgd->cmt_pgs, idx);
463 		group_remove_at(&pgd->cmt_pgs, idx - 1);
464 		err = group_add_at(&pgd->cmt_pgs, parent, idx);
465 		ASSERT(err == 0);
466 		err = group_add_at(&pgd->cmt_pgs, pg, idx - 1);
467 		ASSERT(err == 0);
468 
469 		/*
470 		 * Ensure cmt_lineage references CPU's leaf PG.
471 		 * Since cmt_pgs is top-down ordered, the bottom is the last
472 		 * element.
473 		 */
474 		if ((sz = GROUP_SIZE(&pgd->cmt_pgs)) > 0)
475 			pgd->cmt_lineage = GROUP_ACCESS(&pgd->cmt_pgs, sz - 1);
476 	}
477 
478 	/*
479 	 * Update the parent references for PG and it's parent
480 	 */
481 	pg->cmt_parent = parent->cmt_parent;
482 	parent->cmt_parent = pg;
483 
484 	start_cpus();
485 }
486 
487 /*
488  * CMT class callback for a new CPU entering the system
489  *
490  * This routine operates on the CPU specific processor group data (for the CPU
491  * being initialized). The argument "pgdata" is a reference to the CPU's PG
492  * data to be constructed.
493  *
494  * cp->cpu_pg is used by the dispatcher to access the CPU's PG data
495  * references a "bootstrap" structure. pg_cmt_cpu_init() and the routines it
496  * calls must be careful to operate only on the "pgdata" argument, and not
497  * cp->cpu_pg.
498  */
499 static void
500 pg_cmt_cpu_init(cpu_t *cp, cpu_pg_t *pgdata)
501 {
502 	pg_cmt_t	*pg;
503 	group_t		*cmt_pgs;
504 	int		levels, level;
505 	pghw_type_t	hw;
506 	pg_t		*pg_cache = NULL;
507 	pg_cmt_t	*cpu_cmt_hier[PGHW_NUM_COMPONENTS];
508 	lgrp_handle_t	lgrp_handle;
509 	cmt_lgrp_t	*lgrp;
510 	cmt_lineage_validation_t	lineage_status;
511 
512 	ASSERT(MUTEX_HELD(&cpu_lock));
513 	ASSERT(pg_cpu_is_bootstrapped(cp));
514 
515 	if (cmt_sched_disabled)
516 		return;
517 
518 	/*
519 	 * A new CPU is coming into the system.
520 	 * Interrogate the platform to see if the CPU
521 	 * has any performance or efficiency relevant
522 	 * sharing relationships
523 	 */
524 	cmt_pgs = &pgdata->cmt_pgs;
525 	pgdata->cmt_lineage = NULL;
526 
527 	bzero(cpu_cmt_hier, sizeof (cpu_cmt_hier));
528 	levels = 0;
529 	for (hw = PGHW_START; hw < PGHW_NUM_COMPONENTS; hw++) {
530 
531 		pg_cmt_policy_t	policy;
532 
533 		/*
534 		 * We're only interested in the hw sharing relationships
535 		 * for which we know how to optimize.
536 		 */
537 		policy = pg_cmt_policy(hw);
538 		if (policy == CMT_NO_POLICY ||
539 		    pg_plat_hw_shared(cp, hw) == 0)
540 			continue;
541 
542 		/*
543 		 * We will still create the PGs for hardware sharing
544 		 * relationships that have been blacklisted, but won't
545 		 * implement CMT thread placement optimizations against them.
546 		 */
547 		if (cmt_hw_blacklisted[hw] == 1)
548 			policy = CMT_NO_POLICY;
549 
550 		/*
551 		 * Find (or create) the PG associated with
552 		 * the hw sharing relationship in which cp
553 		 * belongs.
554 		 *
555 		 * Determine if a suitable PG already
556 		 * exists, or if one needs to be created.
557 		 */
558 		pg = (pg_cmt_t *)pghw_place_cpu(cp, hw);
559 		if (pg == NULL) {
560 			/*
561 			 * Create a new one.
562 			 * Initialize the common...
563 			 */
564 			pg = (pg_cmt_t *)pg_create(pg_cmt_class_id);
565 
566 			/* ... physical ... */
567 			pghw_init((pghw_t *)pg, cp, hw);
568 
569 			/*
570 			 * ... and CMT specific portions of the
571 			 * structure.
572 			 */
573 			pg->cmt_policy = policy;
574 
575 			/* CMT event callbacks */
576 			cmt_callback_init((pg_t *)pg);
577 
578 			bitset_init(&pg->cmt_cpus_actv_set);
579 			group_create(&pg->cmt_cpus_actv);
580 		} else {
581 			ASSERT(IS_CMT_PG(pg));
582 		}
583 
584 		((pghw_t *)pg)->pghw_generation++;
585 
586 		/* Add the CPU to the PG */
587 		pg_cpu_add((pg_t *)pg, cp, pgdata);
588 
589 		/*
590 		 * Ensure capacity of the active CPU group/bitset
591 		 */
592 		group_expand(&pg->cmt_cpus_actv,
593 		    GROUP_SIZE(&((pg_t *)pg)->pg_cpus));
594 
595 		if (cp->cpu_seqid >=
596 		    bitset_capacity(&pg->cmt_cpus_actv_set)) {
597 			bitset_resize(&pg->cmt_cpus_actv_set,
598 			    cp->cpu_seqid + 1);
599 		}
600 
601 		/*
602 		 * Build a lineage of CMT PGs for load balancing / coalescence
603 		 */
604 		if (policy & (CMT_BALANCE | CMT_COALESCE)) {
605 			cpu_cmt_hier[levels++] = pg;
606 		}
607 
608 		/* Cache this for later */
609 		if (hw == PGHW_CACHE)
610 			pg_cache = (pg_t *)pg;
611 	}
612 
613 	group_expand(cmt_pgs, levels);
614 
615 	if (cmt_root == NULL)
616 		cmt_root = pg_cmt_lgrp_create(lgrp_plat_root_hand());
617 
618 	/*
619 	 * Find the lgrp that encapsulates this CPU's CMT hierarchy
620 	 */
621 	lgrp_handle = lgrp_plat_cpu_to_hand(cp->cpu_id);
622 	if ((lgrp = pg_cmt_find_lgrp(lgrp_handle)) == NULL)
623 		lgrp = pg_cmt_lgrp_create(lgrp_handle);
624 
625 	/*
626 	 * Ascendingly sort the PGs in the lineage by number of CPUs
627 	 */
628 	pg_cmt_hier_sort(cpu_cmt_hier, levels);
629 
630 	/*
631 	 * Examine the lineage and validate it.
632 	 * This routine will also try to fix the lineage along with the
633 	 * rest of the PG hierarchy should it detect an issue.
634 	 *
635 	 * If it returns anything other than VALID or REPAIRED, an
636 	 * unrecoverable error has occurred, and we cannot proceed.
637 	 */
638 	lineage_status = pg_cmt_lineage_validate(cpu_cmt_hier, &levels, pgdata);
639 	if ((lineage_status != CMT_LINEAGE_VALID) &&
640 	    (lineage_status != CMT_LINEAGE_REPAIRED)) {
641 		/*
642 		 * In the case of an unrecoverable error where CMT scheduling
643 		 * has been disabled, assert that the under construction CPU's
644 		 * PG data has an empty CMT load balancing lineage.
645 		 */
646 		ASSERT((cmt_sched_disabled == 0) ||
647 		    (GROUP_SIZE(&(pgdata->cmt_pgs)) == 0));
648 		return;
649 	}
650 
651 	/*
652 	 * For existing PGs in the lineage, verify that the parent is
653 	 * correct, as the generation in the lineage may have changed
654 	 * as a result of the sorting. Start the traversal at the top
655 	 * of the lineage, moving down.
656 	 */
657 	for (level = levels - 1; level >= 0; ) {
658 		int reorg;
659 
660 		reorg = 0;
661 		pg = cpu_cmt_hier[level];
662 
663 		/*
664 		 * Promote PGs at an incorrect generation into place.
665 		 */
666 		while (pg->cmt_parent &&
667 		    pg->cmt_parent != cpu_cmt_hier[level + 1]) {
668 			cmt_hier_promote(pg, pgdata);
669 			reorg++;
670 		}
671 		if (reorg > 0)
672 			level = levels - 1;
673 		else
674 			level--;
675 	}
676 
677 	/*
678 	 * For each of the PGs in the CPU's lineage:
679 	 *	- Add an entry in the CPU sorted CMT PG group
680 	 *	  which is used for top down CMT load balancing
681 	 *	- Tie the PG into the CMT hierarchy by connecting
682 	 *	  it to it's parent and siblings.
683 	 */
684 	for (level = 0; level < levels; level++) {
685 		uint_t		children;
686 		int		err;
687 
688 		pg = cpu_cmt_hier[level];
689 		err = group_add_at(cmt_pgs, pg, levels - level - 1);
690 		ASSERT(err == 0);
691 
692 		if (level == 0)
693 			pgdata->cmt_lineage = (pg_t *)pg;
694 
695 		if (pg->cmt_siblings != NULL) {
696 			/* Already initialized */
697 			ASSERT(pg->cmt_parent == NULL ||
698 			    pg->cmt_parent == cpu_cmt_hier[level + 1]);
699 			ASSERT(pg->cmt_siblings == &lgrp->cl_pgs ||
700 			    ((pg->cmt_parent != NULL) &&
701 			    pg->cmt_siblings == pg->cmt_parent->cmt_children));
702 			continue;
703 		}
704 
705 		if ((level + 1) == levels) {
706 			pg->cmt_parent = NULL;
707 
708 			pg->cmt_siblings = &lgrp->cl_pgs;
709 			children = ++lgrp->cl_npgs;
710 			if (cmt_root != lgrp)
711 				cmt_root->cl_npgs++;
712 		} else {
713 			pg->cmt_parent = cpu_cmt_hier[level + 1];
714 
715 			/*
716 			 * A good parent keeps track of their children.
717 			 * The parent's children group is also the PG's
718 			 * siblings.
719 			 */
720 			if (pg->cmt_parent->cmt_children == NULL) {
721 				pg->cmt_parent->cmt_children =
722 				    kmem_zalloc(sizeof (group_t), KM_SLEEP);
723 				group_create(pg->cmt_parent->cmt_children);
724 			}
725 			pg->cmt_siblings = pg->cmt_parent->cmt_children;
726 			children = ++pg->cmt_parent->cmt_nchildren;
727 		}
728 
729 		group_expand(pg->cmt_siblings, children);
730 		group_expand(&cmt_root->cl_pgs, cmt_root->cl_npgs);
731 	}
732 
733 	/*
734 	 * Cache the chip and core IDs in the cpu_t->cpu_physid structure
735 	 * for fast lookups later.
736 	 */
737 	if (cp->cpu_physid) {
738 		cp->cpu_physid->cpu_chipid =
739 		    pg_plat_hw_instance_id(cp, PGHW_CHIP);
740 		cp->cpu_physid->cpu_coreid = pg_plat_get_core_id(cp);
741 
742 		/*
743 		 * If this cpu has a PG representing shared cache, then set
744 		 * cpu_cacheid to that PG's logical id
745 		 */
746 		if (pg_cache)
747 			cp->cpu_physid->cpu_cacheid = pg_cache->pg_id;
748 	}
749 
750 	/* CPU0 only initialization */
751 	if (is_cpu0) {
752 		is_cpu0 = 0;
753 		cpu0_lgrp = lgrp;
754 	}
755 
756 }
757 
758 /*
759  * Class callback when a CPU is leaving the system (deletion)
760  *
761  * "pgdata" is a reference to the CPU's PG data to be deconstructed.
762  *
763  * cp->cpu_pg is used by the dispatcher to access the CPU's PG data
764  * references a "bootstrap" structure across this function's invocation.
765  * pg_cmt_cpu_fini() and the routines it calls must be careful to operate only
766  * on the "pgdata" argument, and not cp->cpu_pg.
767  */
768 static void
769 pg_cmt_cpu_fini(cpu_t *cp, cpu_pg_t *pgdata)
770 {
771 	group_iter_t	i;
772 	pg_cmt_t	*pg;
773 	group_t		*pgs, *cmt_pgs;
774 	lgrp_handle_t	lgrp_handle;
775 	cmt_lgrp_t	*lgrp;
776 
777 	if (cmt_sched_disabled)
778 		return;
779 
780 	ASSERT(pg_cpu_is_bootstrapped(cp));
781 
782 	pgs = &pgdata->pgs;
783 	cmt_pgs = &pgdata->cmt_pgs;
784 
785 	/*
786 	 * Find the lgroup that encapsulates this CPU's CMT hierarchy
787 	 */
788 	lgrp_handle = lgrp_plat_cpu_to_hand(cp->cpu_id);
789 
790 	lgrp = pg_cmt_find_lgrp(lgrp_handle);
791 	if (ncpus == 1 && lgrp != cpu0_lgrp) {
792 		/*
793 		 * One might wonder how we could be deconfiguring the
794 		 * only CPU in the system.
795 		 *
796 		 * On Starcat systems when null_proc_lpa is detected,
797 		 * the boot CPU (which is already configured into a leaf
798 		 * lgroup), is moved into the root lgroup. This is done by
799 		 * deconfiguring it from both lgroups and processor
800 		 * groups), and then later reconfiguring it back in.  This
801 		 * call to pg_cmt_cpu_fini() is part of that deconfiguration.
802 		 *
803 		 * This special case is detected by noting that the platform
804 		 * has changed the CPU's lgrp affiliation (since it now
805 		 * belongs in the root). In this case, use the cmt_lgrp_t
806 		 * cached for the boot CPU, since this is what needs to be
807 		 * torn down.
808 		 */
809 		lgrp = cpu0_lgrp;
810 	}
811 
812 	ASSERT(lgrp != NULL);
813 
814 	/*
815 	 * First, clean up anything load balancing specific for each of
816 	 * the CPU's PGs that participated in CMT load balancing
817 	 */
818 	pg = (pg_cmt_t *)pgdata->cmt_lineage;
819 	while (pg != NULL) {
820 
821 		((pghw_t *)pg)->pghw_generation++;
822 
823 		/*
824 		 * Remove the PG from the CPU's load balancing lineage
825 		 */
826 		(void) group_remove(cmt_pgs, pg, GRP_RESIZE);
827 
828 		/*
829 		 * If it's about to become empty, destroy it's children
830 		 * group, and remove it's reference from it's siblings.
831 		 * This is done here (rather than below) to avoid removing
832 		 * our reference from a PG that we just eliminated.
833 		 */
834 		if (GROUP_SIZE(&((pg_t *)pg)->pg_cpus) == 1) {
835 			if (pg->cmt_children != NULL)
836 				group_destroy(pg->cmt_children);
837 			if (pg->cmt_siblings != NULL) {
838 				if (pg->cmt_siblings == &lgrp->cl_pgs)
839 					lgrp->cl_npgs--;
840 				else
841 					pg->cmt_parent->cmt_nchildren--;
842 			}
843 		}
844 		pg = pg->cmt_parent;
845 	}
846 	ASSERT(GROUP_SIZE(cmt_pgs) == 0);
847 
848 	/*
849 	 * Now that the load balancing lineage updates have happened,
850 	 * remove the CPU from all it's PGs (destroying any that become
851 	 * empty).
852 	 */
853 	group_iter_init(&i);
854 	while ((pg = group_iterate(pgs, &i)) != NULL) {
855 		if (IS_CMT_PG(pg) == 0)
856 			continue;
857 
858 		pg_cpu_delete((pg_t *)pg, cp, pgdata);
859 		/*
860 		 * Deleting the CPU from the PG changes the CPU's
861 		 * PG group over which we are actively iterating
862 		 * Re-initialize the iteration
863 		 */
864 		group_iter_init(&i);
865 
866 		if (GROUP_SIZE(&((pg_t *)pg)->pg_cpus) == 0) {
867 
868 			/*
869 			 * The PG has become zero sized, so destroy it.
870 			 */
871 			group_destroy(&pg->cmt_cpus_actv);
872 			bitset_fini(&pg->cmt_cpus_actv_set);
873 			pghw_fini((pghw_t *)pg);
874 
875 			pg_destroy((pg_t *)pg);
876 		}
877 	}
878 }
879 
880 /*
881  * Class callback when a CPU is entering a cpu partition
882  */
883 static void
884 pg_cmt_cpupart_in(cpu_t *cp, cpupart_t *pp)
885 {
886 	group_t		*pgs;
887 	pg_t		*pg;
888 	group_iter_t	i;
889 
890 	ASSERT(MUTEX_HELD(&cpu_lock));
891 
892 	if (cmt_sched_disabled)
893 		return;
894 
895 	pgs = &cp->cpu_pg->pgs;
896 
897 	/*
898 	 * Ensure that the new partition's PG bitset
899 	 * is large enough for all CMT PG's to which cp
900 	 * belongs
901 	 */
902 	group_iter_init(&i);
903 	while ((pg = group_iterate(pgs, &i)) != NULL) {
904 		if (IS_CMT_PG(pg) == 0)
905 			continue;
906 
907 		if (bitset_capacity(&pp->cp_cmt_pgs) <= pg->pg_id)
908 			bitset_resize(&pp->cp_cmt_pgs, pg->pg_id + 1);
909 	}
910 }
911 
912 /*
913  * Class callback when a CPU is actually moving partitions
914  */
915 static void
916 pg_cmt_cpupart_move(cpu_t *cp, cpupart_t *oldpp, cpupart_t *newpp)
917 {
918 	cpu_t		*cpp;
919 	group_t		*pgs;
920 	pg_t		*pg;
921 	group_iter_t	pg_iter;
922 	pg_cpu_itr_t	cpu_iter;
923 	boolean_t	found;
924 
925 	ASSERT(MUTEX_HELD(&cpu_lock));
926 
927 	if (cmt_sched_disabled)
928 		return;
929 
930 	pgs = &cp->cpu_pg->pgs;
931 	group_iter_init(&pg_iter);
932 
933 	/*
934 	 * Iterate over the CPUs CMT PGs
935 	 */
936 	while ((pg = group_iterate(pgs, &pg_iter)) != NULL) {
937 
938 		if (IS_CMT_PG(pg) == 0)
939 			continue;
940 
941 		/*
942 		 * Add the PG to the bitset in the new partition.
943 		 */
944 		bitset_add(&newpp->cp_cmt_pgs, pg->pg_id);
945 
946 		/*
947 		 * Remove the PG from the bitset in the old partition
948 		 * if the last of the PG's CPUs have left.
949 		 */
950 		found = B_FALSE;
951 		PG_CPU_ITR_INIT(pg, cpu_iter);
952 		while ((cpp = pg_cpu_next(&cpu_iter)) != NULL) {
953 			if (cpp == cp)
954 				continue;
955 			if (CPU_ACTIVE(cpp) &&
956 			    cpp->cpu_part->cp_id == oldpp->cp_id) {
957 				found = B_TRUE;
958 				break;
959 			}
960 		}
961 		if (!found)
962 			bitset_del(&cp->cpu_part->cp_cmt_pgs, pg->pg_id);
963 	}
964 }
965 
966 /*
967  * Class callback when a CPU becomes active (online)
968  *
969  * This is called in a context where CPUs are paused
970  */
971 static void
972 pg_cmt_cpu_active(cpu_t *cp)
973 {
974 	int		err;
975 	group_iter_t	i;
976 	pg_cmt_t	*pg;
977 	group_t		*pgs;
978 
979 	ASSERT(MUTEX_HELD(&cpu_lock));
980 
981 	if (cmt_sched_disabled)
982 		return;
983 
984 	pgs = &cp->cpu_pg->pgs;
985 	group_iter_init(&i);
986 
987 	/*
988 	 * Iterate over the CPU's PGs
989 	 */
990 	while ((pg = group_iterate(pgs, &i)) != NULL) {
991 
992 		if (IS_CMT_PG(pg) == 0)
993 			continue;
994 
995 		/*
996 		 * Move to the next generation since topology is changing
997 		 */
998 		((pghw_t *)pg)->pghw_generation++;
999 
1000 		err = group_add(&pg->cmt_cpus_actv, cp, GRP_NORESIZE);
1001 		ASSERT(err == 0);
1002 
1003 		/*
1004 		 * If this is the first active CPU in the PG, and it
1005 		 * represents a hardware sharing relationship over which
1006 		 * CMT load balancing is performed, add it as a candidate
1007 		 * for balancing with it's siblings.
1008 		 */
1009 		if (GROUP_SIZE(&pg->cmt_cpus_actv) == 1 &&
1010 		    (pg->cmt_policy & (CMT_BALANCE | CMT_COALESCE))) {
1011 			err = group_add(pg->cmt_siblings, pg, GRP_NORESIZE);
1012 			ASSERT(err == 0);
1013 
1014 			/*
1015 			 * If this is a top level PG, add it as a balancing
1016 			 * candidate when balancing within the root lgroup.
1017 			 */
1018 			if (pg->cmt_parent == NULL &&
1019 			    pg->cmt_siblings != &cmt_root->cl_pgs) {
1020 				err = group_add(&cmt_root->cl_pgs, pg,
1021 				    GRP_NORESIZE);
1022 				ASSERT(err == 0);
1023 			}
1024 		}
1025 
1026 		/*
1027 		 * Notate the CPU in the PGs active CPU bitset.
1028 		 * Also notate the PG as being active in it's associated
1029 		 * partition
1030 		 */
1031 		bitset_add(&pg->cmt_cpus_actv_set, cp->cpu_seqid);
1032 		bitset_add(&cp->cpu_part->cp_cmt_pgs, ((pg_t *)pg)->pg_id);
1033 	}
1034 }
1035 
1036 /*
1037  * Class callback when a CPU goes inactive (offline)
1038  *
1039  * This is called in a context where CPUs are paused
1040  */
1041 static void
1042 pg_cmt_cpu_inactive(cpu_t *cp)
1043 {
1044 	int		err;
1045 	group_t		*pgs;
1046 	pg_cmt_t	*pg;
1047 	cpu_t		*cpp;
1048 	group_iter_t	i;
1049 	pg_cpu_itr_t	cpu_itr;
1050 	boolean_t	found;
1051 
1052 	ASSERT(MUTEX_HELD(&cpu_lock));
1053 
1054 	if (cmt_sched_disabled)
1055 		return;
1056 
1057 	pgs = &cp->cpu_pg->pgs;
1058 	group_iter_init(&i);
1059 
1060 	while ((pg = group_iterate(pgs, &i)) != NULL) {
1061 
1062 		if (IS_CMT_PG(pg) == 0)
1063 			continue;
1064 
1065 		/*
1066 		 * Move to the next generation since topology is changing
1067 		 */
1068 		((pghw_t *)pg)->pghw_generation++;
1069 
1070 		/*
1071 		 * Remove the CPU from the CMT PGs active CPU group
1072 		 * bitmap
1073 		 */
1074 		err = group_remove(&pg->cmt_cpus_actv, cp, GRP_NORESIZE);
1075 		ASSERT(err == 0);
1076 
1077 		bitset_del(&pg->cmt_cpus_actv_set, cp->cpu_seqid);
1078 
1079 		/*
1080 		 * If there are no more active CPUs in this PG over which
1081 		 * load was balanced, remove it as a balancing candidate.
1082 		 */
1083 		if (GROUP_SIZE(&pg->cmt_cpus_actv) == 0 &&
1084 		    (pg->cmt_policy & (CMT_BALANCE | CMT_COALESCE))) {
1085 			err = group_remove(pg->cmt_siblings, pg, GRP_NORESIZE);
1086 			ASSERT(err == 0);
1087 
1088 			if (pg->cmt_parent == NULL &&
1089 			    pg->cmt_siblings != &cmt_root->cl_pgs) {
1090 				err = group_remove(&cmt_root->cl_pgs, pg,
1091 				    GRP_NORESIZE);
1092 				ASSERT(err == 0);
1093 			}
1094 		}
1095 
1096 		/*
1097 		 * Assert the number of active CPUs does not exceed
1098 		 * the total number of CPUs in the PG
1099 		 */
1100 		ASSERT(GROUP_SIZE(&pg->cmt_cpus_actv) <=
1101 		    GROUP_SIZE(&((pg_t *)pg)->pg_cpus));
1102 
1103 		/*
1104 		 * Update the PG bitset in the CPU's old partition
1105 		 */
1106 		found = B_FALSE;
1107 		PG_CPU_ITR_INIT(pg, cpu_itr);
1108 		while ((cpp = pg_cpu_next(&cpu_itr)) != NULL) {
1109 			if (cpp == cp)
1110 				continue;
1111 			if (CPU_ACTIVE(cpp) &&
1112 			    cpp->cpu_part->cp_id == cp->cpu_part->cp_id) {
1113 				found = B_TRUE;
1114 				break;
1115 			}
1116 		}
1117 		if (!found) {
1118 			bitset_del(&cp->cpu_part->cp_cmt_pgs,
1119 			    ((pg_t *)pg)->pg_id);
1120 		}
1121 	}
1122 }
1123 
1124 /*
1125  * Return non-zero if the CPU belongs in the given PG
1126  */
1127 static int
1128 pg_cmt_cpu_belongs(pg_t *pg, cpu_t *cp)
1129 {
1130 	cpu_t	*pg_cpu;
1131 
1132 	pg_cpu = GROUP_ACCESS(&pg->pg_cpus, 0);
1133 
1134 	ASSERT(pg_cpu != NULL);
1135 
1136 	/*
1137 	 * The CPU belongs if, given the nature of the hardware sharing
1138 	 * relationship represented by the PG, the CPU has that
1139 	 * relationship with some other CPU already in the PG
1140 	 */
1141 	if (pg_plat_cpus_share(cp, pg_cpu, ((pghw_t *)pg)->pghw_hw))
1142 		return (1);
1143 
1144 	return (0);
1145 }
1146 
1147 /*
1148  * Sort the CPUs CMT hierarchy, where "size" is the number of levels.
1149  */
1150 static void
1151 pg_cmt_hier_sort(pg_cmt_t **hier, int size)
1152 {
1153 	int		i, j, inc, sz;
1154 	int		start, end;
1155 	pg_t		*tmp;
1156 	pg_t		**h = (pg_t **)hier;
1157 
1158 	/*
1159 	 * First sort by number of CPUs
1160 	 */
1161 	inc = size / 2;
1162 	while (inc > 0) {
1163 		for (i = inc; i < size; i++) {
1164 			j = i;
1165 			tmp = h[i];
1166 			while ((j >= inc) &&
1167 			    (PG_NUM_CPUS(h[j - inc]) > PG_NUM_CPUS(tmp))) {
1168 				h[j] = h[j - inc];
1169 				j = j - inc;
1170 			}
1171 			h[j] = tmp;
1172 		}
1173 		if (inc == 2)
1174 			inc = 1;
1175 		else
1176 			inc = (inc * 5) / 11;
1177 	}
1178 
1179 	/*
1180 	 * Break ties by asking the platform.
1181 	 * Determine if h[i] outranks h[i + 1] and if so, swap them.
1182 	 */
1183 	for (start = 0; start < size; start++) {
1184 
1185 		/*
1186 		 * Find various contiguous sets of elements,
1187 		 * in the array, with the same number of cpus
1188 		 */
1189 		end = start;
1190 		sz = PG_NUM_CPUS(h[start]);
1191 		while ((end < size) && (sz == PG_NUM_CPUS(h[end])))
1192 			end++;
1193 		/*
1194 		 * Sort each such set of the array by rank
1195 		 */
1196 		for (i = start + 1; i < end; i++) {
1197 			j = i - 1;
1198 			tmp = h[i];
1199 			while (j >= start &&
1200 			    pg_cmt_hier_rank(hier[j],
1201 			    (pg_cmt_t *)tmp) == hier[j]) {
1202 				h[j + 1] = h[j];
1203 				j--;
1204 			}
1205 			h[j + 1] = tmp;
1206 		}
1207 	}
1208 }
1209 
1210 /*
1211  * Return a cmt_lgrp_t * given an lgroup handle.
1212  */
1213 static cmt_lgrp_t *
1214 pg_cmt_find_lgrp(lgrp_handle_t hand)
1215 {
1216 	cmt_lgrp_t	*lgrp;
1217 
1218 	ASSERT(MUTEX_HELD(&cpu_lock));
1219 
1220 	lgrp = cmt_lgrps;
1221 	while (lgrp != NULL) {
1222 		if (lgrp->cl_hand == hand)
1223 			break;
1224 		lgrp = lgrp->cl_next;
1225 	}
1226 	return (lgrp);
1227 }
1228 
1229 /*
1230  * Create a cmt_lgrp_t with the specified handle.
1231  */
1232 static cmt_lgrp_t *
1233 pg_cmt_lgrp_create(lgrp_handle_t hand)
1234 {
1235 	cmt_lgrp_t	*lgrp;
1236 
1237 	ASSERT(MUTEX_HELD(&cpu_lock));
1238 
1239 	lgrp = kmem_zalloc(sizeof (cmt_lgrp_t), KM_SLEEP);
1240 
1241 	lgrp->cl_hand = hand;
1242 	lgrp->cl_npgs = 0;
1243 	lgrp->cl_next = cmt_lgrps;
1244 	cmt_lgrps = lgrp;
1245 	group_create(&lgrp->cl_pgs);
1246 
1247 	return (lgrp);
1248 }
1249 
1250 /*
1251  * Interfaces to enable and disable power aware dispatching
1252  * The caller must be holding cpu_lock.
1253  *
1254  * Return 0 on success and -1 on failure.
1255  */
1256 int
1257 cmt_pad_enable(pghw_type_t type)
1258 {
1259 	group_t		*hwset;
1260 	group_iter_t	iter;
1261 	pg_cmt_t	*pg;
1262 
1263 	ASSERT(PGHW_IS_PM_DOMAIN(type));
1264 	ASSERT(MUTEX_HELD(&cpu_lock));
1265 
1266 	if (cmt_sched_disabled == 1)
1267 		return (-1);
1268 
1269 	if ((hwset = pghw_set_lookup(type)) == NULL ||
1270 	    cmt_hw_blacklisted[type]) {
1271 		/*
1272 		 * Unable to find any instances of the specified type
1273 		 * of power domain, or the power domains have been blacklisted.
1274 		 */
1275 		return (-1);
1276 	}
1277 
1278 	/*
1279 	 * Iterate over the power domains, setting the default dispatcher
1280 	 * policy for power/performance optimization.
1281 	 *
1282 	 * Simply setting the policy isn't enough in the case where the power
1283 	 * domain is an only child of another PG. Because the dispatcher walks
1284 	 * the PG hierarchy in a top down fashion, the higher up PG's policy
1285 	 * will dominate. So promote the power domain above it's parent if both
1286 	 * PG and it's parent have the same CPUs to ensure it's policy
1287 	 * dominates.
1288 	 */
1289 	group_iter_init(&iter);
1290 	while ((pg = group_iterate(hwset, &iter)) != NULL) {
1291 		/*
1292 		 * If the power domain is an only child to a parent
1293 		 * not implementing the same policy, promote the child
1294 		 * above the parent to activate the policy.
1295 		 */
1296 		pg->cmt_policy = pg_cmt_policy(((pghw_t *)pg)->pghw_hw);
1297 		while ((pg->cmt_parent != NULL) &&
1298 		    (pg->cmt_parent->cmt_policy != pg->cmt_policy) &&
1299 		    (PG_NUM_CPUS((pg_t *)pg) ==
1300 		    PG_NUM_CPUS((pg_t *)pg->cmt_parent))) {
1301 			cmt_hier_promote(pg, NULL);
1302 		}
1303 	}
1304 
1305 	return (0);
1306 }
1307 
1308 int
1309 cmt_pad_disable(pghw_type_t type)
1310 {
1311 	group_t		*hwset;
1312 	group_iter_t	iter;
1313 	pg_cmt_t	*pg;
1314 	pg_cmt_t	*child;
1315 
1316 	ASSERT(PGHW_IS_PM_DOMAIN(type));
1317 	ASSERT(MUTEX_HELD(&cpu_lock));
1318 
1319 	if (cmt_sched_disabled == 1)
1320 		return (-1);
1321 
1322 	if ((hwset = pghw_set_lookup(type)) == NULL) {
1323 		/*
1324 		 * Unable to find any instances of the specified type of
1325 		 * power domain.
1326 		 */
1327 		return (-1);
1328 	}
1329 	/*
1330 	 * Iterate over the power domains, setting the default dispatcher
1331 	 * policy for performance optimization (load balancing).
1332 	 */
1333 	group_iter_init(&iter);
1334 	while ((pg = group_iterate(hwset, &iter)) != NULL) {
1335 
1336 		/*
1337 		 * If the power domain has an only child that implements
1338 		 * policy other than load balancing, promote the child
1339 		 * above the power domain to ensure it's policy dominates.
1340 		 */
1341 		if (pg->cmt_children != NULL &&
1342 		    GROUP_SIZE(pg->cmt_children) == 1) {
1343 			child = GROUP_ACCESS(pg->cmt_children, 0);
1344 			if ((child->cmt_policy & CMT_BALANCE) == 0) {
1345 				cmt_hier_promote(child, NULL);
1346 			}
1347 		}
1348 		pg->cmt_policy = CMT_BALANCE;
1349 	}
1350 	return (0);
1351 }
1352 
1353 /* ARGSUSED */
1354 static void
1355 cmt_ev_thread_swtch(pg_t *pg, cpu_t *cp, hrtime_t now, kthread_t *old,
1356 		    kthread_t *new)
1357 {
1358 	pg_cmt_t	*cmt_pg = (pg_cmt_t *)pg;
1359 
1360 	if (old == cp->cpu_idle_thread) {
1361 		atomic_inc_32(&cmt_pg->cmt_utilization);
1362 	} else if (new == cp->cpu_idle_thread) {
1363 		atomic_dec_32(&cmt_pg->cmt_utilization);
1364 	}
1365 }
1366 
/*
 * Evaluates to non-zero if thread "t" is in the TS_RUN state, its dispatch
 * queue has an associated CPU, and that CPU is in the active CPU bitset
 * of the CMT PG "pg".
 *
 * Note that "t" is evaluated more than once.
 */
#define	THREAD_RUNNABLE_IN_PG(t, pg)					\
	(((t)->t_state == TS_RUN) &&					\
	((t)->t_disp_queue->disp_cpu != NULL) &&			\
	bitset_in_set(&(pg)->cmt_cpus_actv_set,				\
	(t)->t_disp_queue->disp_cpu->cpu_seqid))
1375 
1376 static void
1377 cmt_ev_thread_swtch_pwr(pg_t *pg, cpu_t *cp, hrtime_t now, kthread_t *old,
1378     kthread_t *new)
1379 {
1380 	pg_cmt_t	*cmt = (pg_cmt_t *)pg;
1381 	cpupm_domain_t	*dom;
1382 	uint32_t	u;
1383 
1384 	if (old == cp->cpu_idle_thread) {
1385 		ASSERT(new != cp->cpu_idle_thread);
1386 		u = atomic_inc_32_nv(&cmt->cmt_utilization);
1387 		if (u == 1) {
1388 			/*
1389 			 * Notify the CPU power manager that the domain
1390 			 * is non-idle.
1391 			 */
1392 			dom = (cpupm_domain_t *)cmt->cmt_pg.pghw_handle;
1393 			cpupm_utilization_event(cp, now, dom,
1394 			    CPUPM_DOM_BUSY_FROM_IDLE);
1395 		}
1396 	} else if (new == cp->cpu_idle_thread) {
1397 		ASSERT(old != cp->cpu_idle_thread);
1398 		u = atomic_dec_32_nv(&cmt->cmt_utilization);
1399 		if (u == 0) {
1400 			/*
1401 			 * The domain is idle, notify the CPU power
1402 			 * manager.
1403 			 *
1404 			 * Avoid notifying if the thread is simply migrating
1405 			 * between CPUs in the domain.
1406 			 */
1407 			if (!THREAD_RUNNABLE_IN_PG(old, cmt)) {
1408 				dom = (cpupm_domain_t *)cmt->cmt_pg.pghw_handle;
1409 				cpupm_utilization_event(cp, now, dom,
1410 				    CPUPM_DOM_IDLE_FROM_BUSY);
1411 			}
1412 		}
1413 	}
1414 }
1415 
1416 /* ARGSUSED */
1417 static void
1418 cmt_ev_thread_remain_pwr(pg_t *pg, cpu_t *cp, kthread_t *t)
1419 {
1420 	pg_cmt_t	*cmt = (pg_cmt_t *)pg;
1421 	cpupm_domain_t	*dom;
1422 
1423 	dom = (cpupm_domain_t *)cmt->cmt_pg.pghw_handle;
1424 	cpupm_utilization_event(cp, (hrtime_t)0, dom, CPUPM_DOM_REMAIN_BUSY);
1425 }
1426 
1427 /*
1428  * Return the name of the CMT scheduling policy
1429  * being implemented across this PG
1430  */
1431 static char *
1432 pg_cmt_policy_name(pg_t *pg)
1433 {
1434 	pg_cmt_policy_t policy;
1435 
1436 	policy = ((pg_cmt_t *)pg)->cmt_policy;
1437 
1438 	if (policy & CMT_AFFINITY) {
1439 		if (policy & CMT_BALANCE)
1440 			return ("Load Balancing & Affinity");
1441 		else if (policy & CMT_COALESCE)
1442 			return ("Load Coalescence & Affinity");
1443 		else
1444 			return ("Affinity");
1445 	} else {
1446 		if (policy & CMT_BALANCE)
1447 			return ("Load Balancing");
1448 		else if (policy & CMT_COALESCE)
1449 			return ("Load Coalescence");
1450 		else
1451 			return ("None");
1452 	}
1453 }
1454 
1455 /*
1456  * Prune PG, and all other instances of PG's hardware sharing relationship
1457  * from the CMT PG hierarchy.
1458  *
1459  * This routine operates on the CPU specific processor group data (for the CPUs
1460  * in the PG being pruned), and may be invoked from a context where one CPU's
1461  * PG data is under construction. In this case the argument "pgdata", if not
1462  * NULL, is a reference to the CPU's under-construction PG data.
1463  */
1464 static int
1465 pg_cmt_prune(pg_cmt_t *pg_bad, pg_cmt_t **lineage, int *sz, cpu_pg_t *pgdata)
1466 {
1467 	group_t		*hwset, *children;
1468 	int		i, j, r, size = *sz;
1469 	group_iter_t	hw_iter, child_iter;
1470 	pg_cpu_itr_t	cpu_iter;
1471 	pg_cmt_t	*pg, *child;
1472 	cpu_t		*cpu;
1473 	int		cap_needed;
1474 	pghw_type_t	hw;
1475 
1476 	ASSERT(MUTEX_HELD(&cpu_lock));
1477 
1478 	/*
1479 	 * Inform pghw layer that this PG is pruned.
1480 	 */
1481 	pghw_cmt_fini((pghw_t *)pg_bad);
1482 
1483 	hw = ((pghw_t *)pg_bad)->pghw_hw;
1484 
1485 	if (hw == PGHW_POW_ACTIVE) {
1486 		cmn_err(CE_NOTE, "!Active CPUPM domain groups look suspect. "
1487 		    "Event Based CPUPM Unavailable");
1488 	} else if (hw == PGHW_POW_IDLE) {
1489 		cmn_err(CE_NOTE, "!Idle CPUPM domain groups look suspect. "
1490 		    "Dispatcher assisted CPUPM disabled.");
1491 	}
1492 
1493 	/*
1494 	 * Find and eliminate the PG from the lineage.
1495 	 */
1496 	for (i = 0; i < size; i++) {
1497 		if (lineage[i] == pg_bad) {
1498 			for (j = i; j < size - 1; j++)
1499 				lineage[j] = lineage[j + 1];
1500 			*sz = size - 1;
1501 			break;
1502 		}
1503 	}
1504 
1505 	/*
1506 	 * We'll prune all instances of the hardware sharing relationship
1507 	 * represented by pg. But before we do that (and pause CPUs) we need
1508 	 * to ensure the hierarchy's groups are properly sized.
1509 	 */
1510 	hwset = pghw_set_lookup(hw);
1511 
1512 	/*
1513 	 * Blacklist the hardware so future processor groups of this type won't
1514 	 * participate in CMT thread placement.
1515 	 *
1516 	 * XXX
1517 	 * For heterogeneous system configurations, this might be overkill.
1518 	 * We may only need to blacklist the illegal PGs, and other instances
1519 	 * of this hardware sharing relationship may be ok.
1520 	 */
1521 	cmt_hw_blacklisted[hw] = 1;
1522 
1523 	/*
1524 	 * For each of the PGs being pruned, ensure sufficient capacity in
1525 	 * the siblings set for the PG's children
1526 	 */
1527 	group_iter_init(&hw_iter);
1528 	while ((pg = group_iterate(hwset, &hw_iter)) != NULL) {
1529 		/*
1530 		 * PG is being pruned, but if it is bringing up more than
1531 		 * one child, ask for more capacity in the siblings group.
1532 		 */
1533 		cap_needed = 0;
1534 		if (pg->cmt_children &&
1535 		    GROUP_SIZE(pg->cmt_children) > 1) {
1536 			cap_needed = GROUP_SIZE(pg->cmt_children) - 1;
1537 
1538 			group_expand(pg->cmt_siblings,
1539 			    GROUP_SIZE(pg->cmt_siblings) + cap_needed);
1540 
1541 			/*
1542 			 * If this is a top level group, also ensure the
1543 			 * capacity in the root lgrp level CMT grouping.
1544 			 */
1545 			if (pg->cmt_parent == NULL &&
1546 			    pg->cmt_siblings != &cmt_root->cl_pgs) {
1547 				group_expand(&cmt_root->cl_pgs,
1548 				    GROUP_SIZE(&cmt_root->cl_pgs) + cap_needed);
1549 				cmt_root->cl_npgs += cap_needed;
1550 			}
1551 		}
1552 	}
1553 
1554 	/*
1555 	 * We're operating on the PG hierarchy. Pause CPUs to ensure
1556 	 * exclusivity with respect to the dispatcher.
1557 	 */
1558 	pause_cpus(NULL, NULL);
1559 
1560 	/*
1561 	 * Prune all PG instances of the hardware sharing relationship
1562 	 * represented by pg.
1563 	 */
1564 	group_iter_init(&hw_iter);
1565 	while ((pg = group_iterate(hwset, &hw_iter)) != NULL) {
1566 
1567 		/*
1568 		 * Remove PG from it's group of siblings, if it's there.
1569 		 */
1570 		if (pg->cmt_siblings) {
1571 			(void) group_remove(pg->cmt_siblings, pg, GRP_NORESIZE);
1572 		}
1573 		if (pg->cmt_parent == NULL &&
1574 		    pg->cmt_siblings != &cmt_root->cl_pgs) {
1575 			(void) group_remove(&cmt_root->cl_pgs, pg,
1576 			    GRP_NORESIZE);
1577 		}
1578 
1579 		/*
1580 		 * Indicate that no CMT policy will be implemented across
1581 		 * this PG.
1582 		 */
1583 		pg->cmt_policy = CMT_NO_POLICY;
1584 
1585 		/*
1586 		 * Move PG's children from it's children set to it's parent's
1587 		 * children set. Note that the parent's children set, and PG's
1588 		 * siblings set are the same thing.
1589 		 *
1590 		 * Because we are iterating over the same group that we are
1591 		 * operating on (removing the children), first add all of PG's
1592 		 * children to the parent's children set, and once we are done
1593 		 * iterating, empty PG's children set.
1594 		 */
1595 		if (pg->cmt_children != NULL) {
1596 			children = pg->cmt_children;
1597 
1598 			group_iter_init(&child_iter);
1599 			while ((child = group_iterate(children, &child_iter))
1600 			    != NULL) {
1601 				if (pg->cmt_siblings != NULL) {
1602 					r = group_add(pg->cmt_siblings, child,
1603 					    GRP_NORESIZE);
1604 					ASSERT(r == 0);
1605 
1606 					if (pg->cmt_parent == NULL &&
1607 					    pg->cmt_siblings !=
1608 					    &cmt_root->cl_pgs) {
1609 						r = group_add(&cmt_root->cl_pgs,
1610 						    child, GRP_NORESIZE);
1611 						ASSERT(r == 0);
1612 					}
1613 				}
1614 			}
1615 			group_empty(pg->cmt_children);
1616 		}
1617 
1618 		/*
1619 		 * Reset the callbacks to the defaults
1620 		 */
1621 		pg_callback_set_defaults((pg_t *)pg);
1622 
1623 		/*
1624 		 * Update all the CPU lineages in each of PG's CPUs
1625 		 */
1626 		PG_CPU_ITR_INIT(pg, cpu_iter);
1627 		while ((cpu = pg_cpu_next(&cpu_iter)) != NULL) {
1628 			pg_cmt_t	*cpu_pg;
1629 			group_iter_t	liter;	/* Iterator for the lineage */
1630 			cpu_pg_t	*cpd;	/* CPU's PG data */
1631 
1632 			/*
1633 			 * The CPU's lineage is under construction still
1634 			 * references the bootstrap CPU PG data structure.
1635 			 */
1636 			if (pg_cpu_is_bootstrapped(cpu))
1637 				cpd = pgdata;
1638 			else
1639 				cpd = cpu->cpu_pg;
1640 
1641 			/*
1642 			 * Iterate over the CPU's PGs updating the children
1643 			 * of the PG being promoted, since they have a new
1644 			 * parent and siblings set.
1645 			 */
1646 			group_iter_init(&liter);
1647 			while ((cpu_pg = group_iterate(&cpd->pgs,
1648 			    &liter)) != NULL) {
1649 				if (cpu_pg->cmt_parent == pg) {
1650 					cpu_pg->cmt_parent = pg->cmt_parent;
1651 					cpu_pg->cmt_siblings = pg->cmt_siblings;
1652 				}
1653 			}
1654 
1655 			/*
1656 			 * Update the CPU's lineages
1657 			 *
1658 			 * Remove the PG from the CPU's group used for CMT
1659 			 * scheduling.
1660 			 */
1661 			(void) group_remove(&cpd->cmt_pgs, pg, GRP_NORESIZE);
1662 		}
1663 	}
1664 	start_cpus();
1665 	return (0);
1666 }
1667 
1668 /*
1669  * Disable CMT scheduling
1670  */
1671 static void
1672 pg_cmt_disable(void)
1673 {
1674 	cpu_t		*cpu;
1675 
1676 	ASSERT(MUTEX_HELD(&cpu_lock));
1677 
1678 	pause_cpus(NULL, NULL);
1679 	cpu = cpu_list;
1680 
1681 	do {
1682 		if (cpu->cpu_pg)
1683 			group_empty(&cpu->cpu_pg->cmt_pgs);
1684 	} while ((cpu = cpu->cpu_next) != cpu_list);
1685 
1686 	cmt_sched_disabled = 1;
1687 	start_cpus();
1688 	cmn_err(CE_NOTE, "!CMT thread placement optimizations unavailable");
1689 }
1690 
1691 /*
1692  * CMT lineage validation
1693  *
1694  * This routine is invoked by pg_cmt_cpu_init() to validate the integrity
1695  * of the PGs in a CPU's lineage. This is necessary because it's possible that
1696  * some groupings (power domain groupings in particular) may be defined by
1697  * sources that are buggy (e.g. BIOS bugs). In such cases, it may not be
1698  * possible to integrate those groupings into the CMT PG hierarchy, if doing
1699  * so would violate the subset invariant of the hierarchy, which says that
1700  * a PG must be subset of its parent (if it has one).
1701  *
1702  * pg_cmt_lineage_validate()'s purpose is to detect grouping definitions that
1703  * would result in a violation of this invariant. If a violation is found,
1704  * and the PG is of a grouping type who's definition is known to originate from
1705  * suspect sources (BIOS), then pg_cmt_prune() will be invoked to prune the
1706  * PG (and all other instances PG's sharing relationship type) from the CMT
1707  * hierarchy. Further, future instances of that sharing relationship type won't
1708  * be added. If the grouping definition doesn't originate from suspect
1709  * sources, then pg_cmt_disable() will be invoked to log an error, and disable
1710  * CMT scheduling altogether.
1711  *
1712  * This routine is invoked after the CPU has been added to the PGs in which
1713  * it belongs, but before those PGs have been added to (or had their place
1714  * adjusted in) the CMT PG hierarchy.
1715  *
1716  * The first argument is the CPUs PG lineage (essentially an array of PGs in
1717  * which the CPU belongs) that has already been sorted in ascending order
1718  * by CPU count. Some of the PGs in the CPUs lineage may already have other
1719  * CPUs in them, and have already been integrated into the CMT hierarchy.
1720  *
1721  * The addition of this new CPU to these pre-existing PGs means that those
1722  * PGs may need to be promoted up in the hierarchy to satisfy the subset
1723  * invariant. In additon to testing the subset invariant for the lineage,
1724  * this routine also verifies that the addition of the new CPU to the
1725  * existing PGs wouldn't cause the subset invariant to be violated in
1726  * the exiting lineages.
1727  *
1728  * This routine will normally return one of the following:
1729  * CMT_LINEAGE_VALID - There were no problems detected with the lineage.
1730  * CMT_LINEAGE_REPAIRED - Problems were detected, but repaired via pruning.
1731  *
1732  * Otherwise, this routine will return a value indicating which error it
1733  * was unable to recover from (and set cmt_lineage_status along the way).
1734  *
1735  * This routine operates on the CPU specific processor group data (for the CPU
1736  * whose lineage is being validated), which is under-construction.
1737  * "pgdata" is a reference to the CPU's under-construction PG data.
1738  * This routine must be careful to operate only on "pgdata", and not cp->cpu_pg.
1739  */
1740 static cmt_lineage_validation_t
1741 pg_cmt_lineage_validate(pg_cmt_t **lineage, int *sz, cpu_pg_t *pgdata)
1742 {
1743 	int		i, j, size;
1744 	pg_cmt_t	*pg, *pg_next, *pg_bad, *pg_tmp, *parent;
1745 	cpu_t		*cp;
1746 	pg_cpu_itr_t	cpu_iter;
1747 	lgrp_handle_t	lgrp;
1748 
1749 	ASSERT(MUTEX_HELD(&cpu_lock));
1750 
1751 revalidate:
1752 	size = *sz;
1753 	pg_bad = NULL;
1754 	lgrp = LGRP_NULL_HANDLE;
1755 	for (i = 0; i < size; i++) {
1756 
1757 		pg = lineage[i];
1758 		if (i < size - 1)
1759 			pg_next = lineage[i + 1];
1760 		else
1761 			pg_next = NULL;
1762 
1763 		/*
1764 		 * We assume that the lineage has already been sorted
1765 		 * by the number of CPUs. In fact, we depend on it.
1766 		 */
1767 		ASSERT(pg_next == NULL ||
1768 		    (PG_NUM_CPUS((pg_t *)pg) <= PG_NUM_CPUS((pg_t *)pg_next)));
1769 
1770 		/*
1771 		 * The CPUs PG lineage was passed as the first argument to
1772 		 * this routine and contains the sorted list of the CPU's
1773 		 * PGs. Ultimately, the ordering of the PGs in that list, and
1774 		 * the ordering as traversed by the cmt_parent list must be
1775 		 * the same. PG promotion will be used as the mechanism to
1776 		 * achieve this, but first we need to look for cases where
1777 		 * promotion will be necessary, and validate that will be
1778 		 * possible without violating the subset invarient described
1779 		 * above.
1780 		 *
1781 		 * Since the PG topology is in the middle of being changed, we
1782 		 * need to check whether the PG's existing parent (if any) is
1783 		 * part of this CPU's lineage (and therefore should contain
1784 		 * the new CPU). If not, it means that the addition of the
1785 		 * new CPU should have made this PG have more CPUs than its
1786 		 * parent (and other ancestors not in the same lineage) and
1787 		 * will need to be promoted into place.
1788 		 *
1789 		 * We need to verify all of this to defend against a buggy
1790 		 * BIOS giving bad power domain CPU groupings. Sigh.
1791 		 */
1792 		parent = pg->cmt_parent;
1793 		while (parent != NULL) {
1794 			/*
1795 			 * Determine if the parent/ancestor is in this lineage
1796 			 */
1797 			pg_tmp = NULL;
1798 			for (j = 0; (j < size) && (pg_tmp != parent); j++) {
1799 				pg_tmp = lineage[j];
1800 			}
1801 			if (pg_tmp == parent) {
1802 				/*
1803 				 * It's in the lineage. The concentricity
1804 				 * checks will handle the rest.
1805 				 */
1806 				break;
1807 			}
1808 			/*
1809 			 * If it is not in the lineage, PG will eventually
1810 			 * need to be promoted above it. Verify the ancestor
1811 			 * is a proper subset. There is still an error if
1812 			 * the ancestor has the same number of CPUs as PG,
1813 			 * since that would imply it should be in the lineage,
1814 			 * and we already know it isn't.
1815 			 */
1816 			if (PG_NUM_CPUS((pg_t *)parent) >=
1817 			    PG_NUM_CPUS((pg_t *)pg)) {
1818 				/*
1819 				 * Not a proper subset if the parent/ancestor
1820 				 * has the same or more CPUs than PG.
1821 				 */
1822 				cmt_lineage_status = CMT_LINEAGE_NON_PROMOTABLE;
1823 				goto handle_error;
1824 			}
1825 			parent = parent->cmt_parent;
1826 		}
1827 
1828 		/*
1829 		 * Walk each of the CPUs in the PGs group and perform
1830 		 * consistency checks along the way.
1831 		 */
1832 		PG_CPU_ITR_INIT((pg_t *)pg, cpu_iter);
1833 		while ((cp = pg_cpu_next(&cpu_iter)) != NULL) {
1834 			/*
1835 			 * Verify that there aren't any CPUs contained in PG
1836 			 * that the next PG in the lineage (which is larger
1837 			 * or same size) doesn't also contain.
1838 			 */
1839 			if (pg_next != NULL &&
1840 			    pg_cpu_find((pg_t *)pg_next, cp) == B_FALSE) {
1841 				cmt_lineage_status = CMT_LINEAGE_NON_CONCENTRIC;
1842 				goto handle_error;
1843 			}
1844 
1845 			/*
1846 			 * Verify that all the CPUs in the PG are in the same
1847 			 * lgroup.
1848 			 */
1849 			if (lgrp == LGRP_NULL_HANDLE) {
1850 				lgrp = lgrp_plat_cpu_to_hand(cp->cpu_id);
1851 			} else if (lgrp_plat_cpu_to_hand(cp->cpu_id) != lgrp) {
1852 				cmt_lineage_status = CMT_LINEAGE_PG_SPANS_LGRPS;
1853 				goto handle_error;
1854 			}
1855 		}
1856 	}
1857 
1858 handle_error:
1859 	/*
1860 	 * Some of these validation errors can result when the CPU grouping
1861 	 * information is derived from buggy sources (for example, incorrect
1862 	 * ACPI tables on x86 systems).
1863 	 *
1864 	 * We'll try to recover in such cases by pruning out the illegal
1865 	 * groupings from the PG hierarchy, which means that we won't optimize
1866 	 * for those levels, but we will for the remaining ones.
1867 	 */
1868 	switch (cmt_lineage_status) {
1869 	case CMT_LINEAGE_VALID:
1870 	case CMT_LINEAGE_REPAIRED:
1871 		break;
1872 	case CMT_LINEAGE_PG_SPANS_LGRPS:
1873 		/*
1874 		 * We've detected a PG whose CPUs span lgroups.
1875 		 *
1876 		 * This isn't supported, as the dispatcher isn't allowed to
1877 		 * to do CMT thread placement across lgroups, as this would
1878 		 * conflict with policies implementing MPO thread affinity.
1879 		 *
1880 		 * If the PG is of a sharing relationship type known to
1881 		 * legitimately span lgroups, specify that no CMT thread
1882 		 * placement policy should be implemented, and prune the PG
1883 		 * from the existing CMT PG hierarchy.
1884 		 *
1885 		 * Otherwise, fall though to the case below for handling.
1886 		 */
1887 		if (((pghw_t *)pg)->pghw_hw == PGHW_CHIP) {
1888 			if (pg_cmt_prune(pg, lineage, sz, pgdata) == 0) {
1889 				cmt_lineage_status = CMT_LINEAGE_REPAIRED;
1890 				goto revalidate;
1891 			}
1892 		}
1893 		/*LINTED*/
1894 	case CMT_LINEAGE_NON_PROMOTABLE:
1895 		/*
1896 		 * We've detected a PG that already exists in another CPU's
1897 		 * lineage that cannot cannot legally be promoted into place
1898 		 * without breaking the invariants of the hierarchy.
1899 		 */
1900 		if (PG_CMT_HW_SUSPECT(((pghw_t *)pg)->pghw_hw)) {
1901 			if (pg_cmt_prune(pg, lineage, sz, pgdata) == 0) {
1902 				cmt_lineage_status = CMT_LINEAGE_REPAIRED;
1903 				goto revalidate;
1904 			}
1905 		}
1906 		/*
1907 		 * Something went wrong trying to prune out the bad level.
1908 		 * Disable CMT scheduling altogether.
1909 		 */
1910 		pg_cmt_disable();
1911 		break;
1912 	case CMT_LINEAGE_NON_CONCENTRIC:
1913 		/*
1914 		 * We've detected a non-concentric PG lineage, which means that
1915 		 * there's a PG in the lineage that has CPUs that the next PG
1916 		 * over in the lineage (which is the same size or larger)
1917 		 * doesn't have.
1918 		 *
1919 		 * In this case, we examine the two PGs to see if either
1920 		 * grouping is defined by potentially buggy sources.
1921 		 *
1922 		 * If one has less CPUs than the other, and contains CPUs
1923 		 * not found in the parent, and it is an untrusted enumeration,
1924 		 * then prune it. If both have the same number of CPUs, then
1925 		 * prune the one that is untrusted.
1926 		 *
1927 		 * This process repeats until we have a concentric lineage,
1928 		 * or we would have to prune out level derived from what we
1929 		 * thought was a reliable source, in which case CMT scheduling
1930 		 * is disabled altogether.
1931 		 */
1932 		if ((PG_NUM_CPUS((pg_t *)pg) < PG_NUM_CPUS((pg_t *)pg_next)) &&
1933 		    (PG_CMT_HW_SUSPECT(((pghw_t *)pg)->pghw_hw))) {
1934 			pg_bad = pg;
1935 		} else if (PG_NUM_CPUS((pg_t *)pg) ==
1936 		    PG_NUM_CPUS((pg_t *)pg_next)) {
1937 			if (PG_CMT_HW_SUSPECT(((pghw_t *)pg_next)->pghw_hw)) {
1938 				pg_bad = pg_next;
1939 			} else if (PG_CMT_HW_SUSPECT(((pghw_t *)pg)->pghw_hw)) {
1940 				pg_bad = pg;
1941 			}
1942 		}
1943 		if (pg_bad) {
1944 			if (pg_cmt_prune(pg_bad, lineage, sz, pgdata) == 0) {
1945 				cmt_lineage_status = CMT_LINEAGE_REPAIRED;
1946 				goto revalidate;
1947 			}
1948 		}
1949 		/*
1950 		 * Something went wrong trying to identify and/or prune out
1951 		 * the bad level. Disable CMT scheduling altogether.
1952 		 */
1953 		pg_cmt_disable();
1954 		break;
1955 	default:
1956 		/*
1957 		 * If we're here, we've encountered a validation error for
1958 		 * which we don't know how to recover. In this case, disable
1959 		 * CMT scheduling altogether.
1960 		 */
1961 		cmt_lineage_status = CMT_LINEAGE_UNRECOVERABLE;
1962 		pg_cmt_disable();
1963 	}
1964 	return (cmt_lineage_status);
1965 }
1966