xref: /titanic_41/usr/src/uts/common/disp/cmt.c (revision a62774df315360f02521d6470eab7d5080137dad)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/systm.h>
27 #include <sys/types.h>
28 #include <sys/param.h>
29 #include <sys/thread.h>
30 #include <sys/cpuvar.h>
31 #include <sys/cpupart.h>
32 #include <sys/kmem.h>
33 #include <sys/cmn_err.h>
34 #include <sys/kstat.h>
35 #include <sys/processor.h>
36 #include <sys/disp.h>
37 #include <sys/group.h>
38 #include <sys/pghw.h>
39 #include <sys/bitset.h>
40 #include <sys/lgrp.h>
41 #include <sys/cmt.h>
42 #include <sys/cpu_pm.h>
43 
44 /*
45  * CMT scheduler / dispatcher support
46  *
47  * This file implements CMT scheduler support using Processor Groups.
48  * The CMT processor group class creates and maintains the CMT class
49  * specific processor group pg_cmt_t.
50  *
51  * ---------------------------- <-- pg_cmt_t *
52  * | pghw_t                   |
53  * ----------------------------
54  * | CMT class specific data  |
55  * | - hierarchy linkage      |
56  * | - CMT load balancing data|
57  * | - active CPU group/bitset|
58  * ----------------------------
59  *
60  * The scheduler/dispatcher leverages knowledge of the performance
61  * relevant CMT sharing relationships existing between cpus to implement
62  * optimized affinity, load balancing, and coalescence policies.
63  *
64  * Load balancing policy seeks to improve performance by minimizing
65  * contention over shared processor resources / facilities. Affinity
66  * policies seek to improve cache and TLB utilization. Coalescence
67  * policies improve resource utilization and ultimately power efficiency.
68  *
69  * The CMT PGs created by this class are already arranged into a
70  * hierarchy (which is done in the pghw layer). To implement the top-down
71  * CMT load balancing algorithm, the CMT PGs additionally maintain
72  * parent, child and sibling hierarchy relationships.
73  * Parent PGs always contain a superset of their children's resources,
74  * each PG can have at most one parent, and siblings are the group of PGs
75  * sharing the same parent.
76  *
77  * On UMA based systems, the CMT load balancing algorithm begins by balancing
78  * load across the group of top level PGs in the system hierarchy.
79  * On NUMA systems, the CMT load balancing algorithm balances load across the
80  * group of top level PGs in each leaf lgroup...but for root homed threads,
81  * is willing to balance against all the top level PGs in the system.
82  *
83  * Groups of top level PGs are maintained to implement the above, one for each
84  * leaf lgroup (containing the top level PGs in that lgroup), and one (for the
85  * root lgroup) that contains all the top level PGs in the system.
86  */
/*
 * CMT lgroup bookkeeping.
 *
 * cmt_root is the cmt_lgrp_t for the root lgroup. It is created by
 * pg_cmt_cpu_init() the first time that routine finds it NULL, and its
 * cl_pgs group holds the top level PGs used when balancing within the
 * root lgroup.
 *
 * cpu0_lgrp records the cmt_lgrp_t selected for the first CPU that passes
 * through pg_cmt_cpu_init(); pg_cmt_cpu_fini() falls back to it when the
 * only CPU in the system is being deconfigured and its lgroup affiliation
 * has changed.
 */
87 static cmt_lgrp_t	*cmt_lgrps = NULL;	/* cmt_lgrps list head */
88 static cmt_lgrp_t	*cpu0_lgrp = NULL;	/* boot CPU's initial lgrp */
89 						/* used for null_proc_lpa */
90 cmt_lgrp_t		*cmt_root = NULL;	/* root lgroup's cmt_lgrp_t */
91 
/* Cleared by pg_cmt_cpu_init() once the first CPU has been initialized. */
92 static int		is_cpu0 = 1; /* true if this is boot CPU context */
93 
94 /*
95  * Array of hardware sharing relationships that are blacklisted.
96  * CMT scheduling optimizations won't be performed for blacklisted sharing
97  * relationships.
 *
 * Indexed by pghw_type_t. pg_cmt_cpu_init() treats an entry equal to 1 as
 * blacklisted: the PG is still created, but with CMT_NO_POLICY.
98  */
99 static int		cmt_hw_blacklisted[PGHW_NUM_COMPONENTS];
100 
101 /*
102  * Set this to non-zero to disable CMT scheduling
103  * This must be done via kmdb -d, as /etc/system will be too late
 *
 * When set, pg_cmt_class_init() does not register the CMT class, and the
 * class callbacks in this file return without doing any work.
104  */
105 int			cmt_sched_disabled = 0;
106 
107 /*
108  * Status codes for CMT lineage validation
109  * See pg_cmt_lineage_validate() below
 *
 * pg_cmt_cpu_init() continues building the CPU's lineage only when the
 * validator returns CMT_LINEAGE_VALID or CMT_LINEAGE_REPAIRED; any other
 * status makes it abandon the CMT hierarchy setup for that CPU.
 * NOTE(review): the precise meaning of the intermediate codes is defined by
 * pg_cmt_lineage_validate(); confirm there before relying on the names.
110  */
111 typedef enum cmt_lineage_validation {
112 	CMT_LINEAGE_VALID,
113 	CMT_LINEAGE_NON_CONCENTRIC,
114 	CMT_LINEAGE_PG_SPANS_LGRPS,
115 	CMT_LINEAGE_NON_PROMOTABLE,
116 	CMT_LINEAGE_REPAIRED,
117 	CMT_LINEAGE_UNRECOVERABLE
118 } cmt_lineage_validation_t;
119 
120 /*
121  * Status of the current lineage under construction.
122  * One must be holding cpu_lock to change this.
123  */
124 cmt_lineage_validation_t	cmt_lineage_status = CMT_LINEAGE_VALID;
125 
126 /*
127  * Power domain definitions (on x86) are defined by ACPI, and
128  * therefore may be subject to BIOS bugs.
 *
 * PG_CMT_HW_SUSPECT() therefore flags a hardware sharing relationship as
 * suspect exactly when PGHW_IS_PM_DOMAIN() says it is a power domain.
129  */
130 #define	PG_CMT_HW_SUSPECT(hw)	PGHW_IS_PM_DOMAIN(hw)
131 
132 /*
133  * Macro to test if PG is managed by the CMT PG class
 *
 * Evaluates to non-zero when the class id of the PG equals pg_cmt_class_id.
 * The argument is cast to pg_t * before being dereferenced.
134  */
135 #define	IS_CMT_PG(pg)	(((pg_t *)(pg))->pg_class->pgc_id == pg_cmt_class_id)
136 
/* Assigned by pg_cmt_class_init() from the value pg_class_register() returns */
137 static pg_cid_t		pg_cmt_class_id;		/* PG class id */
138 
139 static pg_t		*pg_cmt_alloc();
140 static void		pg_cmt_free(pg_t *);
141 static void		pg_cmt_cpu_init(cpu_t *, cpu_pg_t *);
142 static void		pg_cmt_cpu_fini(cpu_t *, cpu_pg_t *);
143 static void		pg_cmt_cpu_active(cpu_t *);
144 static void		pg_cmt_cpu_inactive(cpu_t *);
145 static void		pg_cmt_cpupart_in(cpu_t *, cpupart_t *);
146 static void		pg_cmt_cpupart_move(cpu_t *, cpupart_t *, cpupart_t *);
147 static char		*pg_cmt_policy_name(pg_t *);
148 static void		pg_cmt_hier_sort(pg_cmt_t **, int);
149 static pg_cmt_t		*pg_cmt_hier_rank(pg_cmt_t *, pg_cmt_t *);
150 static int		pg_cmt_cpu_belongs(pg_t *, cpu_t *);
151 static int		pg_cmt_hw(pghw_type_t);
152 static cmt_lgrp_t	*pg_cmt_find_lgrp(lgrp_handle_t);
153 static cmt_lgrp_t	*pg_cmt_lgrp_create(lgrp_handle_t);
154 static void		cmt_ev_thread_swtch(pg_t *, cpu_t *, hrtime_t,
155 			    kthread_t *, kthread_t *);
156 static void		cmt_ev_thread_swtch_pwr(pg_t *, cpu_t *, hrtime_t,
157 			    kthread_t *, kthread_t *);
158 static void		cmt_ev_thread_remain_pwr(pg_t *, cpu_t *, kthread_t *);
159 static cmt_lineage_validation_t	pg_cmt_lineage_validate(pg_cmt_t **, int *,
160 			    cpu_pg_t *);
161 
162 
163 /*
164  * CMT PG ops
 *
 * Callback table handed to pg_class_register() by pg_cmt_class_init().
 * The initializer is positional, so the entries must stay in the order in
 * which struct pg_ops declares its members.
165  */
166 struct pg_ops pg_ops_cmt = {
167 	pg_cmt_alloc,		/* class specific PG allocation */
168 	pg_cmt_free,		/* class specific PG de-allocation */
169 	pg_cmt_cpu_init,	/* new CPU entering the system */
170 	pg_cmt_cpu_fini,	/* CPU leaving the system */
171 	pg_cmt_cpu_active,	/* CPU becomes active (online) */
172 	pg_cmt_cpu_inactive,	/* CPU goes inactive (offline) */
173 	pg_cmt_cpupart_in,	/* CPU entering a cpu partition */
174 	NULL,			/* cpupart_out */
175 	pg_cmt_cpupart_move,	/* CPU actually moving partitions */
176 	pg_cmt_cpu_belongs,	/* does the CPU belong in the PG? */
177 	pg_cmt_policy_name,
178 };
179 
180 /*
181  * Initialize the CMT PG class
182  */
183 void
184 pg_cmt_class_init(void)
185 {
186 	if (cmt_sched_disabled)
187 		return;
188 
189 	pg_cmt_class_id = pg_class_register("cmt", &pg_ops_cmt, PGR_PHYSICAL);
190 }
191 
192 /*
193  * Called to indicate a new CPU has started up so
194  * that either t0 or the slave startup thread can
195  * be accounted for.
196  */
197 void
198 pg_cmt_cpu_startup(cpu_t *cp)
199 {
200 	pg_ev_thread_swtch(cp, gethrtime_unscaled(), cp->cpu_idle_thread,
201 	    cp->cpu_thread);
202 }
203 
204 /*
205  * Return non-zero if thread can migrate between "from" and "to"
206  * without a performance penalty
207  */
208 int
209 pg_cmt_can_migrate(cpu_t *from, cpu_t *to)
210 {
211 	if (from->cpu_physid->cpu_cacheid ==
212 	    to->cpu_physid->cpu_cacheid)
213 		return (1);
214 	return (0);
215 }
216 
217 /*
218  * CMT class specific PG allocation
219  */
220 static pg_t *
221 pg_cmt_alloc(void)
222 {
223 	return (kmem_zalloc(sizeof (pg_cmt_t), KM_NOSLEEP));
224 }
225 
226 /*
227  * Class specific PG de-allocation
228  */
229 static void
230 pg_cmt_free(pg_t *pg)
231 {
232 	ASSERT(pg != NULL);
233 	ASSERT(IS_CMT_PG(pg));
234 
235 	kmem_free((pg_cmt_t *)pg, sizeof (pg_cmt_t));
236 }
237 
238 /*
239  * Given a hardware sharing relationship, return which dispatcher
240  * policies should be implemented to optimize performance and efficiency
241  */
242 static pg_cmt_policy_t
243 pg_cmt_policy(pghw_type_t hw)
244 {
245 	pg_cmt_policy_t p;
246 
247 	/*
248 	 * Give the platform a chance to override the default
249 	 */
250 	if ((p = pg_plat_cmt_policy(hw)) != CMT_NO_POLICY)
251 		return (p);
252 
253 	switch (hw) {
254 	case PGHW_IPIPE:
255 	case PGHW_FPU:
256 	case PGHW_CHIP:
257 		return (CMT_BALANCE);
258 	case PGHW_CACHE:
259 		return (CMT_AFFINITY);
260 	case PGHW_POW_ACTIVE:
261 	case PGHW_POW_IDLE:
262 		return (CMT_BALANCE);
263 	default:
264 		return (CMT_NO_POLICY);
265 	}
266 }
267 
268 /*
269  * Rank the importance of optimizing for the pg1 relationship vs.
270  * the pg2 relationship.
271  */
272 static pg_cmt_t *
273 pg_cmt_hier_rank(pg_cmt_t *pg1, pg_cmt_t *pg2)
274 {
275 	pghw_type_t hw1 = ((pghw_t *)pg1)->pghw_hw;
276 	pghw_type_t hw2 = ((pghw_t *)pg2)->pghw_hw;
277 
278 	/*
279 	 * A power domain is only important if CPUPM is enabled.
280 	 */
281 	if (cpupm_get_policy() == CPUPM_POLICY_DISABLED) {
282 		if (PGHW_IS_PM_DOMAIN(hw1) && !PGHW_IS_PM_DOMAIN(hw2))
283 			return (pg2);
284 		if (PGHW_IS_PM_DOMAIN(hw2) && !PGHW_IS_PM_DOMAIN(hw1))
285 			return (pg1);
286 	}
287 
288 	/*
289 	 * Otherwise, ask the platform
290 	 */
291 	if (pg_plat_hw_rank(hw1, hw2) == hw1)
292 		return (pg1);
293 	else
294 		return (pg2);
295 }
296 
297 /*
298  * Initialize CMT callbacks for the given PG
299  */
300 static void
301 cmt_callback_init(pg_t *pg)
302 {
303 	/*
304 	 * Stick with the default callbacks if there isn't going to be
305 	 * any CMT thread placement optimizations implemented.
306 	 */
307 	if (((pg_cmt_t *)pg)->cmt_policy == CMT_NO_POLICY)
308 		return;
309 
310 	switch (((pghw_t *)pg)->pghw_hw) {
311 	case PGHW_POW_ACTIVE:
312 		pg->pg_cb.thread_swtch = cmt_ev_thread_swtch_pwr;
313 		pg->pg_cb.thread_remain = cmt_ev_thread_remain_pwr;
314 		break;
315 	default:
316 		pg->pg_cb.thread_swtch = cmt_ev_thread_swtch;
317 
318 	}
319 }
320 
321 /*
322  * Promote PG above it's current parent.
323  * This is only legal if PG has an equal or greater number of CPUs than its
324  * parent.
325  *
326  * This routine operates on the CPU specific processor group data (for the CPUs
327  * in the PG being promoted), and may be invoked from a context where one CPU's
328  * PG data is under construction. In this case the argument "pgdata", if not
329  * NULL, is a reference to the CPU's under-construction PG data.
330  */
331 static void
332 cmt_hier_promote(pg_cmt_t *pg, cpu_pg_t *pgdata)
333 {
334 	pg_cmt_t	*parent;
335 	group_t		*children;
336 	cpu_t		*cpu;
337 	group_iter_t	iter;
338 	pg_cpu_itr_t	cpu_iter;
339 	int		r;
340 	int		err;
341 
342 	ASSERT(MUTEX_HELD(&cpu_lock));
343 
344 	parent = pg->cmt_parent;
345 	if (parent == NULL) {
346 		/*
347 		 * Nothing to do
348 		 */
349 		return;
350 	}
351 
352 	ASSERT(PG_NUM_CPUS((pg_t *)pg) >= PG_NUM_CPUS((pg_t *)parent));
353 
354 	/*
355 	 * We're changing around the hierarchy, which is actively traversed
356 	 * by the dispatcher. Pause CPUS to ensure exclusivity.
357 	 */
358 	pause_cpus(NULL);
359 
360 	/*
361 	 * If necessary, update the parent's sibling set, replacing parent
362 	 * with PG.
363 	 */
364 	if (parent->cmt_siblings) {
365 		if (group_remove(parent->cmt_siblings, parent, GRP_NORESIZE)
366 		    != -1) {
367 			r = group_add(parent->cmt_siblings, pg, GRP_NORESIZE);
368 			ASSERT(r != -1);
369 		}
370 	}
371 
372 	/*
373 	 * If the parent is at the top of the hierarchy, replace it's entry
374 	 * in the root lgroup's group of top level PGs.
375 	 */
376 	if (parent->cmt_parent == NULL &&
377 	    parent->cmt_siblings != &cmt_root->cl_pgs) {
378 		if (group_remove(&cmt_root->cl_pgs, parent, GRP_NORESIZE)
379 		    != -1) {
380 			r = group_add(&cmt_root->cl_pgs, pg, GRP_NORESIZE);
381 			ASSERT(r != -1);
382 		}
383 	}
384 
385 	/*
386 	 * We assume (and therefore assert) that the PG being promoted is an
387 	 * only child of it's parent. Update the parent's children set
388 	 * replacing PG's entry with the parent (since the parent is becoming
389 	 * the child). Then have PG and the parent swap children sets.
390 	 */
391 	ASSERT(GROUP_SIZE(parent->cmt_children) <= 1);
392 	if (group_remove(parent->cmt_children, pg, GRP_NORESIZE) != -1) {
393 		r = group_add(parent->cmt_children, parent, GRP_NORESIZE);
394 		ASSERT(r != -1);
395 	}
396 
397 	children = pg->cmt_children;
398 	pg->cmt_children = parent->cmt_children;
399 	parent->cmt_children = children;
400 
401 	/*
402 	 * Update the sibling references for PG and it's parent
403 	 */
404 	pg->cmt_siblings = parent->cmt_siblings;
405 	parent->cmt_siblings = pg->cmt_children;
406 
407 	/*
408 	 * Update any cached lineages in the per CPU pg data.
409 	 */
410 	PG_CPU_ITR_INIT(pg, cpu_iter);
411 	while ((cpu = pg_cpu_next(&cpu_iter)) != NULL) {
412 		int		idx;
413 		pg_cmt_t	*cpu_pg;
414 		cpu_pg_t	*pgd;	/* CPU's PG data */
415 
416 		/*
417 		 * The CPU's whose lineage is under construction still
418 		 * references the bootstrap CPU PG data structure.
419 		 */
420 		if (pg_cpu_is_bootstrapped(cpu))
421 			pgd = pgdata;
422 		else
423 			pgd = cpu->cpu_pg;
424 
425 		/*
426 		 * Iterate over the CPU's PGs updating the children
427 		 * of the PG being promoted, since they have a new parent.
428 		 */
429 		group_iter_init(&iter);
430 		while ((cpu_pg = group_iterate(&pgd->cmt_pgs, &iter)) != NULL) {
431 			if (cpu_pg->cmt_parent == pg) {
432 				cpu_pg->cmt_parent = parent;
433 			}
434 		}
435 
436 		/*
437 		 * Update the CMT load balancing lineage
438 		 */
439 		if ((idx = group_find(&pgd->cmt_pgs, (void *)pg)) == -1) {
440 			/*
441 			 * Unless this is the CPU who's lineage is being
442 			 * constructed, the PG being promoted should be
443 			 * in the lineage.
444 			 */
445 			ASSERT(pg_cpu_is_bootstrapped(cpu));
446 			continue;
447 		}
448 
449 		ASSERT(GROUP_ACCESS(&pgd->cmt_pgs, idx - 1) == parent);
450 		ASSERT(idx > 0);
451 
452 		/*
453 		 * Have the child and the parent swap places in the CPU's
454 		 * lineage
455 		 */
456 		group_remove_at(&pgd->cmt_pgs, idx);
457 		group_remove_at(&pgd->cmt_pgs, idx - 1);
458 		err = group_add_at(&pgd->cmt_pgs, parent, idx);
459 		ASSERT(err == 0);
460 		err = group_add_at(&pgd->cmt_pgs, pg, idx - 1);
461 		ASSERT(err == 0);
462 	}
463 
464 	/*
465 	 * Update the parent references for PG and it's parent
466 	 */
467 	pg->cmt_parent = parent->cmt_parent;
468 	parent->cmt_parent = pg;
469 
470 	start_cpus();
471 }
472 
473 /*
474  * CMT class callback for a new CPU entering the system
475  *
476  * This routine operates on the CPU specific processor group data (for the CPU
477  * being initialized). The argument "pgdata" is a reference to the CPU's PG
478  * data to be constructed.
479  *
480  * cp->cpu_pg is used by the dispatcher to access the CPU's PG data
481  * references a "bootstrap" structure. pg_cmt_cpu_init() and the routines it
482  * calls must be careful to operate only on the "pgdata" argument, and not
483  * cp->cpu_pg.
484  */
485 static void
486 pg_cmt_cpu_init(cpu_t *cp, cpu_pg_t *pgdata)
487 {
488 	pg_cmt_t	*pg;
489 	group_t		*cmt_pgs;
490 	int		levels, level;
491 	pghw_type_t	hw;
492 	pg_t		*pg_cache = NULL;
493 	pg_cmt_t	*cpu_cmt_hier[PGHW_NUM_COMPONENTS];
494 	lgrp_handle_t	lgrp_handle;
495 	cmt_lgrp_t	*lgrp;
496 	cmt_lineage_validation_t	lineage_status;
497 
498 	ASSERT(MUTEX_HELD(&cpu_lock));
499 	ASSERT(pg_cpu_is_bootstrapped(cp));
500 
501 	if (cmt_sched_disabled)
502 		return;
503 
504 	/*
505 	 * A new CPU is coming into the system.
506 	 * Interrogate the platform to see if the CPU
507 	 * has any performance or efficiency relevant
508 	 * sharing relationships
509 	 */
510 	cmt_pgs = &pgdata->cmt_pgs;
511 	pgdata->cmt_lineage = NULL;
512 
513 	bzero(cpu_cmt_hier, sizeof (cpu_cmt_hier));
514 	levels = 0;
515 	for (hw = PGHW_START; hw < PGHW_NUM_COMPONENTS; hw++) {
516 
517 		pg_cmt_policy_t	policy;
518 
519 		/*
520 		 * We're only interested in the hw sharing relationships
521 		 * for which we know how to optimize.
522 		 */
523 		policy = pg_cmt_policy(hw);
524 		if (policy == CMT_NO_POLICY ||
525 		    pg_plat_hw_shared(cp, hw) == 0)
526 			continue;
527 
528 		/*
529 		 * We will still create the PGs for hardware sharing
530 		 * relationships that have been blacklisted, but won't
531 		 * implement CMT thread placement optimizations against them.
532 		 */
533 		if (cmt_hw_blacklisted[hw] == 1)
534 			policy = CMT_NO_POLICY;
535 
536 		/*
537 		 * Find (or create) the PG associated with
538 		 * the hw sharing relationship in which cp
539 		 * belongs.
540 		 *
541 		 * Determine if a suitable PG already
542 		 * exists, or if one needs to be created.
543 		 */
544 		pg = (pg_cmt_t *)pghw_place_cpu(cp, hw);
545 		if (pg == NULL) {
546 			/*
547 			 * Create a new one.
548 			 * Initialize the common...
549 			 */
550 			pg = (pg_cmt_t *)pg_create(pg_cmt_class_id);
551 
552 			/* ... physical ... */
553 			pghw_init((pghw_t *)pg, cp, hw);
554 
555 			/*
556 			 * ... and CMT specific portions of the
557 			 * structure.
558 			 */
559 			pg->cmt_policy = policy;
560 
561 			/* CMT event callbacks */
562 			cmt_callback_init((pg_t *)pg);
563 
564 			bitset_init(&pg->cmt_cpus_actv_set);
565 			group_create(&pg->cmt_cpus_actv);
566 		} else {
567 			ASSERT(IS_CMT_PG(pg));
568 		}
569 
570 		/* Add the CPU to the PG */
571 		pg_cpu_add((pg_t *)pg, cp, pgdata);
572 
573 		/*
574 		 * Ensure capacity of the active CPU group/bitset
575 		 */
576 		group_expand(&pg->cmt_cpus_actv,
577 		    GROUP_SIZE(&((pg_t *)pg)->pg_cpus));
578 
579 		if (cp->cpu_seqid >=
580 		    bitset_capacity(&pg->cmt_cpus_actv_set)) {
581 			bitset_resize(&pg->cmt_cpus_actv_set,
582 			    cp->cpu_seqid + 1);
583 		}
584 
585 		/*
586 		 * Build a lineage of CMT PGs for load balancing / coalescence
587 		 */
588 		if (policy & (CMT_BALANCE | CMT_COALESCE)) {
589 			cpu_cmt_hier[levels++] = pg;
590 		}
591 
592 		/* Cache this for later */
593 		if (hw == PGHW_CACHE)
594 			pg_cache = (pg_t *)pg;
595 	}
596 
597 	group_expand(cmt_pgs, levels);
598 
599 	if (cmt_root == NULL)
600 		cmt_root = pg_cmt_lgrp_create(lgrp_plat_root_hand());
601 
602 	/*
603 	 * Find the lgrp that encapsulates this CPU's CMT hierarchy
604 	 */
605 	lgrp_handle = lgrp_plat_cpu_to_hand(cp->cpu_id);
606 	if ((lgrp = pg_cmt_find_lgrp(lgrp_handle)) == NULL)
607 		lgrp = pg_cmt_lgrp_create(lgrp_handle);
608 
609 	/*
610 	 * Ascendingly sort the PGs in the lineage by number of CPUs
611 	 */
612 	pg_cmt_hier_sort(cpu_cmt_hier, levels);
613 
614 	/*
615 	 * Examine the lineage and validate it.
616 	 * This routine will also try to fix the lineage along with the
617 	 * rest of the PG hierarchy should it detect an issue.
618 	 *
619 	 * If it returns anything other than VALID or REPAIRED, an
620 	 * unrecoverable error has occurred, and we cannot proceed.
621 	 */
622 	lineage_status = pg_cmt_lineage_validate(cpu_cmt_hier, &levels, pgdata);
623 	if ((lineage_status != CMT_LINEAGE_VALID) &&
624 	    (lineage_status != CMT_LINEAGE_REPAIRED)) {
625 		/*
626 		 * In the case of an unrecoverable error where CMT scheduling
627 		 * has been disabled, assert that the under construction CPU's
628 		 * PG data has an empty CMT load balancing lineage.
629 		 */
630 		ASSERT((cmt_sched_disabled == 0) ||
631 		    (GROUP_SIZE(&(pgdata->cmt_pgs)) == 0));
632 		return;
633 	}
634 
635 	/*
636 	 * For existing PGs in the lineage, verify that the parent is
637 	 * correct, as the generation in the lineage may have changed
638 	 * as a result of the sorting. Start the traversal at the top
639 	 * of the lineage, moving down.
640 	 */
641 	for (level = levels - 1; level >= 0; ) {
642 		int reorg;
643 
644 		reorg = 0;
645 		pg = cpu_cmt_hier[level];
646 
647 		/*
648 		 * Promote PGs at an incorrect generation into place.
649 		 */
650 		while (pg->cmt_parent &&
651 		    pg->cmt_parent != cpu_cmt_hier[level + 1]) {
652 			cmt_hier_promote(pg, pgdata);
653 			reorg++;
654 		}
655 		if (reorg > 0)
656 			level = levels - 1;
657 		else
658 			level--;
659 	}
660 
661 	/*
662 	 * For each of the PGs in the CPU's lineage:
663 	 *	- Add an entry in the CPU sorted CMT PG group
664 	 *	  which is used for top down CMT load balancing
665 	 *	- Tie the PG into the CMT hierarchy by connecting
666 	 *	  it to it's parent and siblings.
667 	 */
668 	for (level = 0; level < levels; level++) {
669 		uint_t		children;
670 		int		err;
671 
672 		pg = cpu_cmt_hier[level];
673 		err = group_add_at(cmt_pgs, pg, levels - level - 1);
674 		ASSERT(err == 0);
675 
676 		if (level == 0)
677 			pgdata->cmt_lineage = (pg_t *)pg;
678 
679 		if (pg->cmt_siblings != NULL) {
680 			/* Already initialized */
681 			ASSERT(pg->cmt_parent == NULL ||
682 			    pg->cmt_parent == cpu_cmt_hier[level + 1]);
683 			ASSERT(pg->cmt_siblings == &lgrp->cl_pgs ||
684 			    ((pg->cmt_parent != NULL) &&
685 			    pg->cmt_siblings == pg->cmt_parent->cmt_children));
686 			continue;
687 		}
688 
689 		if ((level + 1) == levels) {
690 			pg->cmt_parent = NULL;
691 
692 			pg->cmt_siblings = &lgrp->cl_pgs;
693 			children = ++lgrp->cl_npgs;
694 			if (cmt_root != lgrp)
695 				cmt_root->cl_npgs++;
696 		} else {
697 			pg->cmt_parent = cpu_cmt_hier[level + 1];
698 
699 			/*
700 			 * A good parent keeps track of their children.
701 			 * The parent's children group is also the PG's
702 			 * siblings.
703 			 */
704 			if (pg->cmt_parent->cmt_children == NULL) {
705 				pg->cmt_parent->cmt_children =
706 				    kmem_zalloc(sizeof (group_t), KM_SLEEP);
707 				group_create(pg->cmt_parent->cmt_children);
708 			}
709 			pg->cmt_siblings = pg->cmt_parent->cmt_children;
710 			children = ++pg->cmt_parent->cmt_nchildren;
711 		}
712 
713 		group_expand(pg->cmt_siblings, children);
714 		group_expand(&cmt_root->cl_pgs, cmt_root->cl_npgs);
715 	}
716 
717 	/*
718 	 * Cache the chip and core IDs in the cpu_t->cpu_physid structure
719 	 * for fast lookups later.
720 	 */
721 	if (cp->cpu_physid) {
722 		cp->cpu_physid->cpu_chipid =
723 		    pg_plat_hw_instance_id(cp, PGHW_CHIP);
724 		cp->cpu_physid->cpu_coreid = pg_plat_get_core_id(cp);
725 
726 		/*
727 		 * If this cpu has a PG representing shared cache, then set
728 		 * cpu_cacheid to that PG's logical id
729 		 */
730 		if (pg_cache)
731 			cp->cpu_physid->cpu_cacheid = pg_cache->pg_id;
732 	}
733 
734 	/* CPU0 only initialization */
735 	if (is_cpu0) {
736 		is_cpu0 = 0;
737 		cpu0_lgrp = lgrp;
738 	}
739 
740 }
741 
742 /*
743  * Class callback when a CPU is leaving the system (deletion)
744  *
745  * "pgdata" is a reference to the CPU's PG data to be deconstructed.
746  *
747  * cp->cpu_pg is used by the dispatcher to access the CPU's PG data
748  * references a "bootstrap" structure across this function's invocation.
749  * pg_cmt_cpu_init() and the routines it calls must be careful to operate only
750  * on the "pgdata" argument, and not cp->cpu_pg.
751  */
752 static void
753 pg_cmt_cpu_fini(cpu_t *cp, cpu_pg_t *pgdata)
754 {
755 	group_iter_t	i;
756 	pg_cmt_t	*pg;
757 	group_t		*pgs, *cmt_pgs;
758 	lgrp_handle_t	lgrp_handle;
759 	cmt_lgrp_t	*lgrp;
760 
761 	if (cmt_sched_disabled)
762 		return;
763 
764 	ASSERT(pg_cpu_is_bootstrapped(cp));
765 
766 	pgs = &pgdata->pgs;
767 	cmt_pgs = &pgdata->cmt_pgs;
768 
769 	/*
770 	 * Find the lgroup that encapsulates this CPU's CMT hierarchy
771 	 */
772 	lgrp_handle = lgrp_plat_cpu_to_hand(cp->cpu_id);
773 
774 	lgrp = pg_cmt_find_lgrp(lgrp_handle);
775 	if (ncpus == 1 && lgrp != cpu0_lgrp) {
776 		/*
777 		 * One might wonder how we could be deconfiguring the
778 		 * only CPU in the system.
779 		 *
780 		 * On Starcat systems when null_proc_lpa is detected,
781 		 * the boot CPU (which is already configured into a leaf
782 		 * lgroup), is moved into the root lgroup. This is done by
783 		 * deconfiguring it from both lgroups and processor
784 		 * groups), and then later reconfiguring it back in.  This
785 		 * call to pg_cmt_cpu_fini() is part of that deconfiguration.
786 		 *
787 		 * This special case is detected by noting that the platform
788 		 * has changed the CPU's lgrp affiliation (since it now
789 		 * belongs in the root). In this case, use the cmt_lgrp_t
790 		 * cached for the boot CPU, since this is what needs to be
791 		 * torn down.
792 		 */
793 		lgrp = cpu0_lgrp;
794 	}
795 
796 	ASSERT(lgrp != NULL);
797 
798 	/*
799 	 * First, clean up anything load balancing specific for each of
800 	 * the CPU's PGs that participated in CMT load balancing
801 	 */
802 	pg = (pg_cmt_t *)pgdata->cmt_lineage;
803 	while (pg != NULL) {
804 
805 		/*
806 		 * Remove the PG from the CPU's load balancing lineage
807 		 */
808 		(void) group_remove(cmt_pgs, pg, GRP_RESIZE);
809 
810 		/*
811 		 * If it's about to become empty, destroy it's children
812 		 * group, and remove it's reference from it's siblings.
813 		 * This is done here (rather than below) to avoid removing
814 		 * our reference from a PG that we just eliminated.
815 		 */
816 		if (GROUP_SIZE(&((pg_t *)pg)->pg_cpus) == 1) {
817 			if (pg->cmt_children != NULL)
818 				group_destroy(pg->cmt_children);
819 			if (pg->cmt_siblings != NULL) {
820 				if (pg->cmt_siblings == &lgrp->cl_pgs)
821 					lgrp->cl_npgs--;
822 				else
823 					pg->cmt_parent->cmt_nchildren--;
824 			}
825 		}
826 		pg = pg->cmt_parent;
827 	}
828 	ASSERT(GROUP_SIZE(cmt_pgs) == 0);
829 
830 	/*
831 	 * Now that the load balancing lineage updates have happened,
832 	 * remove the CPU from all it's PGs (destroying any that become
833 	 * empty).
834 	 */
835 	group_iter_init(&i);
836 	while ((pg = group_iterate(pgs, &i)) != NULL) {
837 		if (IS_CMT_PG(pg) == 0)
838 			continue;
839 
840 		pg_cpu_delete((pg_t *)pg, cp, pgdata);
841 		/*
842 		 * Deleting the CPU from the PG changes the CPU's
843 		 * PG group over which we are actively iterating
844 		 * Re-initialize the iteration
845 		 */
846 		group_iter_init(&i);
847 
848 		if (GROUP_SIZE(&((pg_t *)pg)->pg_cpus) == 0) {
849 
850 			/*
851 			 * The PG has become zero sized, so destroy it.
852 			 */
853 			group_destroy(&pg->cmt_cpus_actv);
854 			bitset_fini(&pg->cmt_cpus_actv_set);
855 			pghw_fini((pghw_t *)pg);
856 
857 			pg_destroy((pg_t *)pg);
858 		}
859 	}
860 }
861 
862 /*
863  * Class callback when a CPU is entering a cpu partition
864  */
865 static void
866 pg_cmt_cpupart_in(cpu_t *cp, cpupart_t *pp)
867 {
868 	group_t		*pgs;
869 	pg_t		*pg;
870 	group_iter_t	i;
871 
872 	ASSERT(MUTEX_HELD(&cpu_lock));
873 
874 	if (cmt_sched_disabled)
875 		return;
876 
877 	pgs = &cp->cpu_pg->pgs;
878 
879 	/*
880 	 * Ensure that the new partition's PG bitset
881 	 * is large enough for all CMT PG's to which cp
882 	 * belongs
883 	 */
884 	group_iter_init(&i);
885 	while ((pg = group_iterate(pgs, &i)) != NULL) {
886 		if (IS_CMT_PG(pg) == 0)
887 			continue;
888 
889 		if (bitset_capacity(&pp->cp_cmt_pgs) <= pg->pg_id)
890 			bitset_resize(&pp->cp_cmt_pgs, pg->pg_id + 1);
891 	}
892 }
893 
894 /*
895  * Class callback when a CPU is actually moving partitions
896  */
897 static void
898 pg_cmt_cpupart_move(cpu_t *cp, cpupart_t *oldpp, cpupart_t *newpp)
899 {
900 	cpu_t		*cpp;
901 	group_t		*pgs;
902 	pg_t		*pg;
903 	group_iter_t	pg_iter;
904 	pg_cpu_itr_t	cpu_iter;
905 	boolean_t	found;
906 
907 	ASSERT(MUTEX_HELD(&cpu_lock));
908 
909 	if (cmt_sched_disabled)
910 		return;
911 
912 	pgs = &cp->cpu_pg->pgs;
913 	group_iter_init(&pg_iter);
914 
915 	/*
916 	 * Iterate over the CPUs CMT PGs
917 	 */
918 	while ((pg = group_iterate(pgs, &pg_iter)) != NULL) {
919 
920 		if (IS_CMT_PG(pg) == 0)
921 			continue;
922 
923 		/*
924 		 * Add the PG to the bitset in the new partition.
925 		 */
926 		bitset_add(&newpp->cp_cmt_pgs, pg->pg_id);
927 
928 		/*
929 		 * Remove the PG from the bitset in the old partition
930 		 * if the last of the PG's CPUs have left.
931 		 */
932 		found = B_FALSE;
933 		PG_CPU_ITR_INIT(pg, cpu_iter);
934 		while ((cpp = pg_cpu_next(&cpu_iter)) != NULL) {
935 			if (cpp == cp)
936 				continue;
937 			if (CPU_ACTIVE(cpp) &&
938 			    cpp->cpu_part->cp_id == oldpp->cp_id) {
939 				found = B_TRUE;
940 				break;
941 			}
942 		}
943 		if (!found)
944 			bitset_del(&cp->cpu_part->cp_cmt_pgs, pg->pg_id);
945 	}
946 }
947 
948 /*
949  * Class callback when a CPU becomes active (online)
950  *
951  * This is called in a context where CPUs are paused
952  */
953 static void
954 pg_cmt_cpu_active(cpu_t *cp)
955 {
956 	int		err;
957 	group_iter_t	i;
958 	pg_cmt_t	*pg;
959 	group_t		*pgs;
960 
961 	ASSERT(MUTEX_HELD(&cpu_lock));
962 
963 	if (cmt_sched_disabled)
964 		return;
965 
966 	pgs = &cp->cpu_pg->pgs;
967 	group_iter_init(&i);
968 
969 	/*
970 	 * Iterate over the CPU's PGs
971 	 */
972 	while ((pg = group_iterate(pgs, &i)) != NULL) {
973 
974 		if (IS_CMT_PG(pg) == 0)
975 			continue;
976 
977 		err = group_add(&pg->cmt_cpus_actv, cp, GRP_NORESIZE);
978 		ASSERT(err == 0);
979 
980 		/*
981 		 * If this is the first active CPU in the PG, and it
982 		 * represents a hardware sharing relationship over which
983 		 * CMT load balancing is performed, add it as a candidate
984 		 * for balancing with it's siblings.
985 		 */
986 		if (GROUP_SIZE(&pg->cmt_cpus_actv) == 1 &&
987 		    (pg->cmt_policy & (CMT_BALANCE | CMT_COALESCE))) {
988 			err = group_add(pg->cmt_siblings, pg, GRP_NORESIZE);
989 			ASSERT(err == 0);
990 
991 			/*
992 			 * If this is a top level PG, add it as a balancing
993 			 * candidate when balancing within the root lgroup.
994 			 */
995 			if (pg->cmt_parent == NULL &&
996 			    pg->cmt_siblings != &cmt_root->cl_pgs) {
997 				err = group_add(&cmt_root->cl_pgs, pg,
998 				    GRP_NORESIZE);
999 				ASSERT(err == 0);
1000 			}
1001 		}
1002 
1003 		/*
1004 		 * Notate the CPU in the PGs active CPU bitset.
1005 		 * Also notate the PG as being active in it's associated
1006 		 * partition
1007 		 */
1008 		bitset_add(&pg->cmt_cpus_actv_set, cp->cpu_seqid);
1009 		bitset_add(&cp->cpu_part->cp_cmt_pgs, ((pg_t *)pg)->pg_id);
1010 	}
1011 }
1012 
1013 /*
1014  * Class callback when a CPU goes inactive (offline)
1015  *
1016  * This is called in a context where CPUs are paused
1017  */
1018 static void
1019 pg_cmt_cpu_inactive(cpu_t *cp)
1020 {
1021 	int		err;
1022 	group_t		*pgs;
1023 	pg_cmt_t	*pg;
1024 	cpu_t		*cpp;
1025 	group_iter_t	i;
1026 	pg_cpu_itr_t	cpu_itr;
1027 	boolean_t	found;
1028 
1029 	ASSERT(MUTEX_HELD(&cpu_lock));
1030 
1031 	if (cmt_sched_disabled)
1032 		return;
1033 
1034 	pgs = &cp->cpu_pg->pgs;
1035 	group_iter_init(&i);
1036 
1037 	while ((pg = group_iterate(pgs, &i)) != NULL) {
1038 
1039 		if (IS_CMT_PG(pg) == 0)
1040 			continue;
1041 
1042 		/*
1043 		 * Remove the CPU from the CMT PGs active CPU group
1044 		 * bitmap
1045 		 */
1046 		err = group_remove(&pg->cmt_cpus_actv, cp, GRP_NORESIZE);
1047 		ASSERT(err == 0);
1048 
1049 		bitset_del(&pg->cmt_cpus_actv_set, cp->cpu_seqid);
1050 
1051 		/*
1052 		 * If there are no more active CPUs in this PG over which
1053 		 * load was balanced, remove it as a balancing candidate.
1054 		 */
1055 		if (GROUP_SIZE(&pg->cmt_cpus_actv) == 0 &&
1056 		    (pg->cmt_policy & (CMT_BALANCE | CMT_COALESCE))) {
1057 			err = group_remove(pg->cmt_siblings, pg, GRP_NORESIZE);
1058 			ASSERT(err == 0);
1059 
1060 			if (pg->cmt_parent == NULL &&
1061 			    pg->cmt_siblings != &cmt_root->cl_pgs) {
1062 				err = group_remove(&cmt_root->cl_pgs, pg,
1063 				    GRP_NORESIZE);
1064 				ASSERT(err == 0);
1065 			}
1066 		}
1067 
1068 		/*
1069 		 * Assert the number of active CPUs does not exceed
1070 		 * the total number of CPUs in the PG
1071 		 */
1072 		ASSERT(GROUP_SIZE(&pg->cmt_cpus_actv) <=
1073 		    GROUP_SIZE(&((pg_t *)pg)->pg_cpus));
1074 
1075 		/*
1076 		 * Update the PG bitset in the CPU's old partition
1077 		 */
1078 		found = B_FALSE;
1079 		PG_CPU_ITR_INIT(pg, cpu_itr);
1080 		while ((cpp = pg_cpu_next(&cpu_itr)) != NULL) {
1081 			if (cpp == cp)
1082 				continue;
1083 			if (CPU_ACTIVE(cpp) &&
1084 			    cpp->cpu_part->cp_id == cp->cpu_part->cp_id) {
1085 				found = B_TRUE;
1086 				break;
1087 			}
1088 		}
1089 		if (!found) {
1090 			bitset_del(&cp->cpu_part->cp_cmt_pgs,
1091 			    ((pg_t *)pg)->pg_id);
1092 		}
1093 	}
1094 }
1095 
1096 /*
1097  * Return non-zero if the CPU belongs in the given PG
1098  */
1099 static int
1100 pg_cmt_cpu_belongs(pg_t *pg, cpu_t *cp)
1101 {
1102 	cpu_t	*pg_cpu;
1103 
1104 	pg_cpu = GROUP_ACCESS(&pg->pg_cpus, 0);
1105 
1106 	ASSERT(pg_cpu != NULL);
1107 
1108 	/*
1109 	 * The CPU belongs if, given the nature of the hardware sharing
1110 	 * relationship represented by the PG, the CPU has that
1111 	 * relationship with some other CPU already in the PG
1112 	 */
1113 	if (pg_plat_cpus_share(cp, pg_cpu, ((pghw_t *)pg)->pghw_hw))
1114 		return (1);
1115 
1116 	return (0);
1117 }
1118 
1119 /*
1120  * Sort the CPUs CMT hierarchy, where "size" is the number of levels.
1121  */
1122 static void
1123 pg_cmt_hier_sort(pg_cmt_t **hier, int size)
1124 {
1125 	int		i, j, inc;
1126 	pg_t		*tmp;
1127 	pg_t		**h = (pg_t **)hier;
1128 
1129 	/*
1130 	 * First sort by number of CPUs
1131 	 */
1132 	inc = size / 2;
1133 	while (inc > 0) {
1134 		for (i = inc; i < size; i++) {
1135 			j = i;
1136 			tmp = h[i];
1137 			while ((j >= inc) &&
1138 			    (PG_NUM_CPUS(h[j - inc]) > PG_NUM_CPUS(tmp))) {
1139 				h[j] = h[j - inc];
1140 				j = j - inc;
1141 			}
1142 			h[j] = tmp;
1143 		}
1144 		if (inc == 2)
1145 			inc = 1;
1146 		else
1147 			inc = (inc * 5) / 11;
1148 	}
1149 
1150 	/*
1151 	 * Break ties by asking the platform.
1152 	 * Determine if h[i] outranks h[i + 1] and if so, swap them.
1153 	 */
1154 	for (i = 0; i < size - 1; i++) {
1155 		if ((PG_NUM_CPUS(h[i]) == PG_NUM_CPUS(h[i + 1])) &&
1156 		    pg_cmt_hier_rank(hier[i], hier[i + 1]) == hier[i]) {
1157 			tmp = h[i];
1158 			h[i] = h[i + 1];
1159 			h[i + 1] = tmp;
1160 		}
1161 	}
1162 }
1163 
1164 /*
1165  * Return a cmt_lgrp_t * given an lgroup handle.
1166  */
1167 static cmt_lgrp_t *
1168 pg_cmt_find_lgrp(lgrp_handle_t hand)
1169 {
1170 	cmt_lgrp_t	*lgrp;
1171 
1172 	ASSERT(MUTEX_HELD(&cpu_lock));
1173 
1174 	lgrp = cmt_lgrps;
1175 	while (lgrp != NULL) {
1176 		if (lgrp->cl_hand == hand)
1177 			break;
1178 		lgrp = lgrp->cl_next;
1179 	}
1180 	return (lgrp);
1181 }
1182 
1183 /*
1184  * Create a cmt_lgrp_t with the specified handle.
1185  */
1186 static cmt_lgrp_t *
1187 pg_cmt_lgrp_create(lgrp_handle_t hand)
1188 {
1189 	cmt_lgrp_t	*lgrp;
1190 
1191 	ASSERT(MUTEX_HELD(&cpu_lock));
1192 
1193 	lgrp = kmem_zalloc(sizeof (cmt_lgrp_t), KM_SLEEP);
1194 
1195 	lgrp->cl_hand = hand;
1196 	lgrp->cl_npgs = 0;
1197 	lgrp->cl_next = cmt_lgrps;
1198 	cmt_lgrps = lgrp;
1199 	group_create(&lgrp->cl_pgs);
1200 
1201 	return (lgrp);
1202 }
1203 
1204 /*
1205  * Interfaces to enable and disable power aware dispatching
1206  * The caller must be holding cpu_lock.
1207  *
1208  * Return 0 on success and -1 on failure.
1209  */
1210 int
1211 cmt_pad_enable(pghw_type_t type)
1212 {
1213 	group_t		*hwset;
1214 	group_iter_t	iter;
1215 	pg_cmt_t	*pg;
1216 
1217 	ASSERT(PGHW_IS_PM_DOMAIN(type));
1218 	ASSERT(MUTEX_HELD(&cpu_lock));
1219 
1220 	if ((hwset = pghw_set_lookup(type)) == NULL ||
1221 	    cmt_hw_blacklisted[type]) {
1222 		/*
1223 		 * Unable to find any instances of the specified type
1224 		 * of power domain, or the power domains have been blacklisted.
1225 		 */
1226 		return (-1);
1227 	}
1228 
1229 	/*
1230 	 * Iterate over the power domains, setting the default dispatcher
1231 	 * policy for power/performance optimization.
1232 	 *
1233 	 * Simply setting the policy isn't enough in the case where the power
1234 	 * domain is an only child of another PG. Because the dispatcher walks
1235 	 * the PG hierarchy in a top down fashion, the higher up PG's policy
1236 	 * will dominate. So promote the power domain above it's parent if both
1237 	 * PG and it's parent have the same CPUs to ensure it's policy
1238 	 * dominates.
1239 	 */
1240 	group_iter_init(&iter);
1241 	while ((pg = group_iterate(hwset, &iter)) != NULL) {
1242 		/*
1243 		 * If the power domain is an only child to a parent
1244 		 * not implementing the same policy, promote the child
1245 		 * above the parent to activate the policy.
1246 		 */
1247 		pg->cmt_policy = pg_cmt_policy(((pghw_t *)pg)->pghw_hw);
1248 		while ((pg->cmt_parent != NULL) &&
1249 		    (pg->cmt_parent->cmt_policy != pg->cmt_policy) &&
1250 		    (PG_NUM_CPUS((pg_t *)pg) ==
1251 		    PG_NUM_CPUS((pg_t *)pg->cmt_parent))) {
1252 			cmt_hier_promote(pg, NULL);
1253 		}
1254 	}
1255 
1256 	return (0);
1257 }
1258 
1259 int
1260 cmt_pad_disable(pghw_type_t type)
1261 {
1262 	group_t		*hwset;
1263 	group_iter_t	iter;
1264 	pg_cmt_t	*pg;
1265 	pg_cmt_t	*child;
1266 
1267 	ASSERT(PGHW_IS_PM_DOMAIN(type));
1268 	ASSERT(MUTEX_HELD(&cpu_lock));
1269 
1270 	if ((hwset = pghw_set_lookup(type)) == NULL) {
1271 		/*
1272 		 * Unable to find any instances of the specified type of
1273 		 * power domain.
1274 		 */
1275 		return (-1);
1276 	}
1277 	/*
1278 	 * Iterate over the power domains, setting the default dispatcher
1279 	 * policy for performance optimization (load balancing).
1280 	 */
1281 	group_iter_init(&iter);
1282 	while ((pg = group_iterate(hwset, &iter)) != NULL) {
1283 
1284 		/*
1285 		 * If the power domain has an only child that implements
1286 		 * policy other than load balancing, promote the child
1287 		 * above the power domain to ensure it's policy dominates.
1288 		 */
1289 		if (pg->cmt_children != NULL &&
1290 		    GROUP_SIZE(pg->cmt_children) == 1) {
1291 			child = GROUP_ACCESS(pg->cmt_children, 0);
1292 			if ((child->cmt_policy & CMT_BALANCE) == 0) {
1293 				cmt_hier_promote(child, NULL);
1294 			}
1295 		}
1296 		pg->cmt_policy = CMT_BALANCE;
1297 	}
1298 	return (0);
1299 }
1300 
1301 /* ARGSUSED */
1302 static void
1303 cmt_ev_thread_swtch(pg_t *pg, cpu_t *cp, hrtime_t now, kthread_t *old,
1304 		    kthread_t *new)
1305 {
1306 	pg_cmt_t	*cmt_pg = (pg_cmt_t *)pg;
1307 
1308 	if (old == cp->cpu_idle_thread) {
1309 		atomic_add_32(&cmt_pg->cmt_utilization, 1);
1310 	} else if (new == cp->cpu_idle_thread) {
1311 		atomic_add_32(&cmt_pg->cmt_utilization, -1);
1312 	}
1313 }
1314 
/*
 * Macro to test whether a thread is currently runnable on a CPU in a PG.
 *
 * Evaluates to non-zero when all of the following hold:
 *  - the thread's state is TS_RUN,
 *  - the thread's dispatch queue has an associated CPU (disp_cpu != NULL),
 *  - that CPU's sequence id is in the PG's active CPU bitset.
 *
 * "t" is a kthread_t *, and "pg" is a pg_cmt_t *. Note that "t" is evaluated
 * more than once, so arguments with side effects must be avoided.
 */
#define	THREAD_RUNNABLE_IN_PG(t, pg)					\
	((t)->t_state == TS_RUN &&					\
	    (t)->t_disp_queue->disp_cpu &&				\
	    bitset_in_set(&(pg)->cmt_cpus_actv_set,			\
	    (t)->t_disp_queue->disp_cpu->cpu_seqid))
1323 
1324 static void
1325 cmt_ev_thread_swtch_pwr(pg_t *pg, cpu_t *cp, hrtime_t now, kthread_t *old,
1326     kthread_t *new)
1327 {
1328 	pg_cmt_t	*cmt = (pg_cmt_t *)pg;
1329 	cpupm_domain_t	*dom;
1330 	uint32_t	u;
1331 
1332 	if (old == cp->cpu_idle_thread) {
1333 		ASSERT(new != cp->cpu_idle_thread);
1334 		u = atomic_add_32_nv(&cmt->cmt_utilization, 1);
1335 		if (u == 1) {
1336 			/*
1337 			 * Notify the CPU power manager that the domain
1338 			 * is non-idle.
1339 			 */
1340 			dom = (cpupm_domain_t *)cmt->cmt_pg.pghw_handle;
1341 			cpupm_utilization_event(cp, now, dom,
1342 			    CPUPM_DOM_BUSY_FROM_IDLE);
1343 		}
1344 	} else if (new == cp->cpu_idle_thread) {
1345 		ASSERT(old != cp->cpu_idle_thread);
1346 		u = atomic_add_32_nv(&cmt->cmt_utilization, -1);
1347 		if (u == 0) {
1348 			/*
1349 			 * The domain is idle, notify the CPU power
1350 			 * manager.
1351 			 *
1352 			 * Avoid notifying if the thread is simply migrating
1353 			 * between CPUs in the domain.
1354 			 */
1355 			if (!THREAD_RUNNABLE_IN_PG(old, cmt)) {
1356 				dom = (cpupm_domain_t *)cmt->cmt_pg.pghw_handle;
1357 				cpupm_utilization_event(cp, now, dom,
1358 				    CPUPM_DOM_IDLE_FROM_BUSY);
1359 			}
1360 		}
1361 	}
1362 }
1363 
1364 /* ARGSUSED */
1365 static void
1366 cmt_ev_thread_remain_pwr(pg_t *pg, cpu_t *cp, kthread_t *t)
1367 {
1368 	pg_cmt_t	*cmt = (pg_cmt_t *)pg;
1369 	cpupm_domain_t	*dom;
1370 
1371 	dom = (cpupm_domain_t *)cmt->cmt_pg.pghw_handle;
1372 	cpupm_utilization_event(cp, (hrtime_t)0, dom, CPUPM_DOM_REMAIN_BUSY);
1373 }
1374 
1375 /*
1376  * Return the name of the CMT scheduling policy
1377  * being implemented across this PG
1378  */
1379 static char *
1380 pg_cmt_policy_name(pg_t *pg)
1381 {
1382 	pg_cmt_policy_t policy;
1383 
1384 	policy = ((pg_cmt_t *)pg)->cmt_policy;
1385 
1386 	if (policy & CMT_AFFINITY) {
1387 		if (policy & CMT_BALANCE)
1388 			return ("Load Balancing & Affinity");
1389 		else if (policy & CMT_COALESCE)
1390 			return ("Load Coalescence & Affinity");
1391 		else
1392 			return ("Affinity");
1393 	} else {
1394 		if (policy & CMT_BALANCE)
1395 			return ("Load Balancing");
1396 		else if (policy & CMT_COALESCE)
1397 			return ("Load Coalescence");
1398 		else
1399 			return ("None");
1400 	}
1401 }
1402 
/*
 * Prune PG, and all other instances of PG's hardware sharing relationship
 * from the CMT PG hierarchy.
 *
 * This routine operates on the CPU specific processor group data (for the CPUs
 * in the PG being pruned), and may be invoked from a context where one CPU's
 * PG data is under construction. In this case the argument "pgdata", if not
 * NULL, is a reference to the CPU's under-construction PG data.
 *
 * Arguments:
 *   pg_bad  - the offending PG; its hardware type selects what gets pruned.
 *   lineage - the lineage (array of PGs) from which pg_bad is removed.
 *   sz      - in: number of entries in lineage; out: decremented by one if
 *             pg_bad was found in (and removed from) the lineage.
 *   pgdata  - the under-construction CPU PG data, used in place of
 *             cpu->cpu_pg for a CPU that is still bootstrapped.
 *
 * As a side effect, the hardware type is blacklisted in cmt_hw_blacklisted[].
 *
 * The caller must hold cpu_lock. CPUs are paused while the hierarchy is being
 * modified, and restarted before returning.
 *
 * As written, this routine always returns 0.
 */
static int
pg_cmt_prune(pg_cmt_t *pg_bad, pg_cmt_t **lineage, int *sz, cpu_pg_t *pgdata)
{
	group_t		*hwset, *children;
	int		i, j, r, size = *sz;
	group_iter_t	hw_iter, child_iter;
	pg_cpu_itr_t	cpu_iter;
	pg_cmt_t	*pg, *child;
	cpu_t		*cpu;
	int		cap_needed;
	pghw_type_t	hw;

	ASSERT(MUTEX_HELD(&cpu_lock));

	hw = ((pghw_t *)pg_bad)->pghw_hw;

	/*
	 * Log a note when a power domain grouping is what's being pruned.
	 */
	if (hw == PGHW_POW_ACTIVE) {
		cmn_err(CE_NOTE, "!Active CPUPM domain groups look suspect. "
		    "Event Based CPUPM Unavailable");
	} else if (hw == PGHW_POW_IDLE) {
		cmn_err(CE_NOTE, "!Idle CPUPM domain groups look suspect. "
		    "Dispatcher assisted CPUPM disabled.");
	}

	/*
	 * Find and eliminate the PG from the lineage, shifting the entries
	 * that follow it down by one. If the PG isn't found, the lineage and
	 * *sz are left untouched.
	 */
	for (i = 0; i < size; i++) {
		if (lineage[i] == pg_bad) {
			for (j = i; j < size - 1; j++)
				lineage[j] = lineage[j + 1];
			*sz = size - 1;
			break;
		}
	}

	/*
	 * We'll prune all instances of the hardware sharing relationship
	 * represented by pg. But before we do that (and pause CPUs) we need
	 * to ensure the hierarchy's groups are properly sized.
	 */
	hwset = pghw_set_lookup(hw);

	/*
	 * Blacklist the hardware so future processor groups of this type won't
	 * participate in CMT thread placement.
	 *
	 * XXX
	 * For heterogeneous system configurations, this might be overkill.
	 * We may only need to blacklist the illegal PGs, and other instances
	 * of this hardware sharing relationship may be ok.
	 */
	cmt_hw_blacklisted[hw] = 1;

	/*
	 * For each of the PGs being pruned, ensure sufficient capacity in
	 * the siblings set for the PG's children
	 */
	group_iter_init(&hw_iter);
	while ((pg = group_iterate(hwset, &hw_iter)) != NULL) {
		/*
		 * PG is being pruned, but if it is bringing up more than
		 * one child, ask for more capacity in the siblings group.
		 * (One of the children takes over the slot being vacated by
		 * the PG, hence the "- 1" below.)
		 */
		cap_needed = 0;
		if (pg->cmt_children &&
		    GROUP_SIZE(pg->cmt_children) > 1) {
			cap_needed = GROUP_SIZE(pg->cmt_children) - 1;

			group_expand(pg->cmt_siblings,
			    GROUP_SIZE(pg->cmt_siblings) + cap_needed);

			/*
			 * If this is a top level group, also ensure the
			 * capacity in the root lgrp level CMT grouping.
			 */
			if (pg->cmt_parent == NULL &&
			    pg->cmt_siblings != &cmt_root->cl_pgs) {
				group_expand(&cmt_root->cl_pgs,
				    GROUP_SIZE(&cmt_root->cl_pgs) + cap_needed);
				cmt_root->cl_npgs += cap_needed;
			}
		}
	}

	/*
	 * We're operating on the PG hierarchy. Pause CPUs to ensure
	 * exclusivity with respect to the dispatcher.
	 *
	 * The group operations from here until start_cpus() are all done
	 * with GRP_NORESIZE; the capacity they need was set up above.
	 */
	pause_cpus(NULL);

	/*
	 * Prune all PG instances of the hardware sharing relationship
	 * represented by pg.
	 */
	group_iter_init(&hw_iter);
	while ((pg = group_iterate(hwset, &hw_iter)) != NULL) {

		/*
		 * Remove PG from its group of siblings, if it's there.
		 * The return values are deliberately ignored, since the PG
		 * may not be a member.
		 */
		if (pg->cmt_siblings) {
			(void) group_remove(pg->cmt_siblings, pg, GRP_NORESIZE);
		}
		if (pg->cmt_parent == NULL &&
		    pg->cmt_siblings != &cmt_root->cl_pgs) {
			(void) group_remove(&cmt_root->cl_pgs, pg,
			    GRP_NORESIZE);
		}

		/*
		 * Indicate that no CMT policy will be implemented across
		 * this PG.
		 */
		pg->cmt_policy = CMT_NO_POLICY;

		/*
		 * Move PG's children from its children set to its parent's
		 * children set. Note that the parent's children set, and PG's
		 * siblings set are the same thing.
		 *
		 * Because we are iterating over the same group that we are
		 * operating on (removing the children), first add all of PG's
		 * children to the parent's children set, and once we are done
		 * iterating, empty PG's children set.
		 */
		if (pg->cmt_children != NULL) {
			children = pg->cmt_children;

			group_iter_init(&child_iter);
			while ((child = group_iterate(children, &child_iter))
			    != NULL) {
				if (pg->cmt_siblings != NULL) {
					r = group_add(pg->cmt_siblings, child,
					    GRP_NORESIZE);
					ASSERT(r == 0);

					/*
					 * Children of a top level PG are
					 * also added to the root lgrp level
					 * CMT grouping.
					 */
					if (pg->cmt_parent == NULL &&
					    pg->cmt_siblings !=
					    &cmt_root->cl_pgs) {
						r = group_add(&cmt_root->cl_pgs,
						    child, GRP_NORESIZE);
						ASSERT(r == 0);
					}
				}
			}
			group_empty(pg->cmt_children);
		}

		/*
		 * Reset the callbacks to the defaults
		 */
		pg_callback_set_defaults((pg_t *)pg);

		/*
		 * Update all the CPU lineages in each of PG's CPUs
		 */
		PG_CPU_ITR_INIT(pg, cpu_iter);
		while ((cpu = pg_cpu_next(&cpu_iter)) != NULL) {
			pg_cmt_t	*cpu_pg;
			group_iter_t	liter;	/* Iterator for the lineage */
			cpu_pg_t	*cpd;	/* CPU's PG data */

			/*
			 * A CPU whose lineage is still under construction
			 * still references the bootstrap CPU PG data
			 * structure, so operate on "pgdata" instead.
			 */
			if (pg_cpu_is_bootstrapped(cpu))
				cpd = pgdata;
			else
				cpd = cpu->cpu_pg;

			/*
			 * Iterate over the CPU's PGs updating the children
			 * of the PG being pruned, since they have a new
			 * parent and siblings set (those of the pruned PG).
			 */
			group_iter_init(&liter);
			while ((cpu_pg = group_iterate(&cpd->pgs,
			    &liter)) != NULL) {
				if (cpu_pg->cmt_parent == pg) {
					cpu_pg->cmt_parent = pg->cmt_parent;
					cpu_pg->cmt_siblings = pg->cmt_siblings;
				}
			}

			/*
			 * Update the CPU's lineages
			 *
			 * Remove the PG from the CPU's group used for CMT
			 * scheduling.
			 */
			(void) group_remove(&cpd->cmt_pgs, pg, GRP_NORESIZE);
		}
	}
	start_cpus();
	return (0);
}
1610 
1611 /*
1612  * Disable CMT scheduling
1613  */
1614 static void
1615 pg_cmt_disable(void)
1616 {
1617 	cpu_t		*cpu;
1618 
1619 	ASSERT(MUTEX_HELD(&cpu_lock));
1620 
1621 	pause_cpus(NULL);
1622 	cpu = cpu_list;
1623 
1624 	do {
1625 		if (cpu->cpu_pg)
1626 			group_empty(&cpu->cpu_pg->cmt_pgs);
1627 	} while ((cpu = cpu->cpu_next) != cpu_list);
1628 
1629 	cmt_sched_disabled = 1;
1630 	start_cpus();
1631 	cmn_err(CE_NOTE, "!CMT thread placement optimizations unavailable");
1632 }
1633 
/*
 * CMT lineage validation
 *
 * This routine is invoked by pg_cmt_cpu_init() to validate the integrity
 * of the PGs in a CPU's lineage. This is necessary because it's possible that
 * some groupings (power domain groupings in particular) may be defined by
 * sources that are buggy (e.g. BIOS bugs). In such cases, it may not be
 * possible to integrate those groupings into the CMT PG hierarchy, if doing
 * so would violate the subset invariant of the hierarchy, which says that
 * a PG must be a subset of its parent (if it has one).
 *
 * pg_cmt_lineage_validate()'s purpose is to detect grouping definitions that
 * would result in a violation of this invariant. If a violation is found,
 * and the PG is of a grouping type whose definition is known to originate from
 * suspect sources (BIOS), then pg_cmt_prune() will be invoked to prune the
 * PG (and all other instances of the PG's sharing relationship type) from the
 * hierarchy. Further, future instances of that sharing relationship type won't
 * be instantiated. If the grouping definition doesn't originate from suspect
 * sources, then pg_cmt_disable() will be invoked to log an error, and disable
 * CMT scheduling altogether.
 *
 * This routine is invoked after the CPU has been added to the PGs in which
 * it belongs, but before those PGs have been added to (or had their place
 * adjusted in) the CMT PG hierarchy.
 *
 * The first argument is the CPU's PG lineage (essentially an array of PGs in
 * which the CPU belongs) that has already been sorted in ascending order
 * by CPU count. Some of the PGs in the CPU's lineage may already have other
 * CPUs in them, and have already been integrated into the CMT hierarchy.
 * The second argument, "sz", points at the number of PGs in the lineage, and
 * is updated should PGs be pruned from the lineage.
 *
 * The addition of this new CPU to these pre-existing PGs means that those
 * PGs may need to be promoted up in the hierarchy to satisfy the subset
 * invariant. In addition to testing the subset invariant for the lineage,
 * this routine also verifies that the addition of the new CPU to the
 * existing PGs wouldn't cause the subset invariant to be violated in
 * the existing lineages.
 *
 * This routine will normally return one of the following:
 * CMT_LINEAGE_VALID - There were no problems detected with the lineage.
 * CMT_LINEAGE_REPAIRED - Problems were detected, but repaired via pruning.
 *
 * Otherwise, this routine will return a value indicating which error it
 * was unable to recover from (and set cmt_lineage_status along the way).
 *
 * Note that the value returned is whatever cmt_lineage_status holds on the
 * way out. That variable isn't local to this routine, and is only written
 * here when a problem is detected or repaired.
 *
 * This routine operates on the CPU specific processor group data (for the CPU
 * whose lineage is being validated), which is under-construction.
 * "pgdata" is a reference to the CPU's under-construction PG data.
 * This routine must be careful to operate only on "pgdata", and not cp->cpu_pg.
 *
 * The caller must hold cpu_lock.
 */
static cmt_lineage_validation_t
pg_cmt_lineage_validate(pg_cmt_t **lineage, int *sz, cpu_pg_t *pgdata)
{
	int		i, j, size;
	pg_cmt_t	*pg, *pg_next, *pg_bad, *pg_tmp;
	cpu_t		*cp;
	pg_cpu_itr_t	cpu_iter;
	lgrp_handle_t	lgrp;

	ASSERT(MUTEX_HELD(&cpu_lock));

	/*
	 * Validation starts over from here every time a PG has been pruned,
	 * as pruning changes both the lineage and its size.
	 */
revalidate:
	size = *sz;
	pg_bad = NULL;
	lgrp = LGRP_NULL_HANDLE;
	for (i = 0; i < size; i++) {

		pg = lineage[i];
		if (i < size - 1)
			pg_next = lineage[i + 1];
		else
			pg_next = NULL;

		/*
		 * We assume that the lineage has already been sorted
		 * by the number of CPUs. In fact, we depend on it.
		 */
		ASSERT(pg_next == NULL ||
		    (PG_NUM_CPUS((pg_t *)pg) <= PG_NUM_CPUS((pg_t *)pg_next)));

		/*
		 * Check to make sure that the existing parent of PG (if any)
		 * is either in the PG's lineage, or the PG has more CPUs than
		 * its existing parent and can and should be promoted above its
		 * parent.
		 *
		 * Since the PG topology is in the middle of being changed, we
		 * need to check whether the PG's existing parent (if any) is
		 * part of its lineage (and therefore should contain the new
		 * CPU). If not, it means that the addition of the new CPU
		 * should have made this PG have more CPUs than its parent, and
		 * this PG should be promoted to be above its existing parent
		 * now. We need to verify all of this to defend against a buggy
		 * BIOS giving bad power domain CPU groupings. Sigh.
		 */
		if (pg->cmt_parent) {
			/*
			 * Determine if cmt_parent is in this lineage.
			 * If it is, pg_tmp is left referencing it when the
			 * loop ends. Otherwise, pg_tmp ends up being the
			 * last PG in the lineage.
			 */
			for (j = 0; j < size; j++) {
				pg_tmp = lineage[j];
				if (pg_tmp == pg->cmt_parent)
					break;
			}
			if (pg_tmp != pg->cmt_parent) {
				/*
				 * cmt_parent is not in the lineage, verify
				 * it is a proper subset of PG.
				 */
				if (PG_NUM_CPUS((pg_t *)pg->cmt_parent) >=
				    PG_NUM_CPUS((pg_t *)pg)) {
					/*
					 * Not a proper subset if pg doesn't
					 * have more CPUs than cmt_parent...
					 */
					cmt_lineage_status =
					    CMT_LINEAGE_NON_PROMOTABLE;
					goto handle_error;
				}
			}
		}

		/*
		 * Walk each of the CPUs in the PGs group and perform
		 * consistency checks along the way.
		 */
		PG_CPU_ITR_INIT((pg_t *)pg, cpu_iter);
		while ((cp = pg_cpu_next(&cpu_iter)) != NULL) {
			/*
			 * Verify that there aren't any CPUs contained in PG
			 * that the next PG in the lineage (which is larger
			 * or same size) doesn't also contain.
			 */
			if (pg_next != NULL &&
			    pg_cpu_find((pg_t *)pg_next, cp) == B_FALSE) {
				cmt_lineage_status = CMT_LINEAGE_NON_CONCENTRIC;
				goto handle_error;
			}

			/*
			 * Verify that all the CPUs in the PG are in the same
			 * lgroup. The handle of the first CPU seen is
			 * remembered for the remainder of this validation
			 * pass, and every CPU that follows is compared
			 * against it.
			 */
			if (lgrp == LGRP_NULL_HANDLE) {
				lgrp = lgrp_plat_cpu_to_hand(cp->cpu_id);
			} else if (lgrp_plat_cpu_to_hand(cp->cpu_id) != lgrp) {
				cmt_lineage_status = CMT_LINEAGE_PG_SPANS_LGRPS;
				goto handle_error;
			}
		}
	}

	/*
	 * Note that we also arrive here if the loop above ran to completion
	 * without detecting any problems, in which case cmt_lineage_status
	 * has not been modified by this validation pass.
	 */
handle_error:
	/*
	 * Some of these validation errors can result when the CPU grouping
	 * information is derived from buggy sources (for example, incorrect
	 * ACPI tables on x86 systems).
	 *
	 * We'll try to recover in such cases by pruning out the illegal
	 * groupings from the PG hierarchy, which means that we won't optimize
	 * for those levels, but we will for the remaining ones.
	 */
	switch (cmt_lineage_status) {
	case CMT_LINEAGE_VALID:
	case CMT_LINEAGE_REPAIRED:
		break;
	case CMT_LINEAGE_PG_SPANS_LGRPS:
		/*
		 * We've detected a PG whose CPUs span lgroups.
		 *
		 * This isn't supported, as the dispatcher isn't allowed
		 * to do CMT thread placement across lgroups, as this would
		 * conflict with policies implementing MPO thread affinity.
		 *
		 * If the PG is of a sharing relationship type known to
		 * legitimately span lgroups, specify that no CMT thread
		 * placement policy should be implemented, and prune the PG
		 * from the existing CMT PG hierarchy.
		 *
		 * Otherwise, fall through to the case below for handling.
		 */
		if (((pghw_t *)pg)->pghw_hw == PGHW_CHIP) {
			if (pg_cmt_prune(pg, lineage, sz, pgdata) == 0) {
				cmt_lineage_status = CMT_LINEAGE_REPAIRED;
				goto revalidate;
			}
		}
		/*LINTED*/
	case CMT_LINEAGE_NON_PROMOTABLE:
		/*
		 * We've detected a PG that already exists in another CPU's
		 * lineage that cannot legally be promoted into place
		 * without breaking the invariants of the hierarchy.
		 */
		if (PG_CMT_HW_SUSPECT(((pghw_t *)pg)->pghw_hw)) {
			if (pg_cmt_prune(pg, lineage, sz, pgdata) == 0) {
				cmt_lineage_status = CMT_LINEAGE_REPAIRED;
				goto revalidate;
			}
		}
		/*
		 * Something went wrong trying to prune out the bad level.
		 * Disable CMT scheduling altogether.
		 */
		pg_cmt_disable();
		break;
	case CMT_LINEAGE_NON_CONCENTRIC:
		/*
		 * We've detected a non-concentric PG lineage, which means that
		 * there's a PG in the lineage that has CPUs that the next PG
		 * over in the lineage (which is the same size or larger)
		 * doesn't have.
		 *
		 * In this case, we examine the two PGs to see if either
		 * grouping is defined by potentially buggy sources.
		 *
		 * If one has fewer CPUs than the other, and contains CPUs
		 * not found in the parent, and it is an untrusted enumeration,
		 * then prune it. If both have the same number of CPUs, then
		 * prune the one that is untrusted (preferring the next PG
		 * over if both are).
		 *
		 * This process repeats until we have a concentric lineage,
		 * or we would have to prune out a level derived from what we
		 * thought was a reliable source, in which case CMT scheduling
		 * is disabled altogether.
		 */
		if ((PG_NUM_CPUS((pg_t *)pg) < PG_NUM_CPUS((pg_t *)pg_next)) &&
		    (PG_CMT_HW_SUSPECT(((pghw_t *)pg)->pghw_hw))) {
			pg_bad = pg;
		} else if (PG_NUM_CPUS((pg_t *)pg) ==
		    PG_NUM_CPUS((pg_t *)pg_next)) {
			if (PG_CMT_HW_SUSPECT(((pghw_t *)pg_next)->pghw_hw)) {
				pg_bad = pg_next;
			} else if (PG_CMT_HW_SUSPECT(((pghw_t *)pg)->pghw_hw)) {
				pg_bad = pg;
			}
		}
		if (pg_bad) {
			if (pg_cmt_prune(pg_bad, lineage, sz, pgdata) == 0) {
				cmt_lineage_status = CMT_LINEAGE_REPAIRED;
				goto revalidate;
			}
		}
		/*
		 * Something went wrong trying to identify and/or prune out
		 * the bad level. Disable CMT scheduling altogether.
		 */
		pg_cmt_disable();
		break;
	default:
		/*
		 * If we're here, we've encountered a validation error for
		 * which we don't know how to recover. In this case, disable
		 * CMT scheduling altogether.
		 */
		cmt_lineage_status = CMT_LINEAGE_UNRECOVERABLE;
		pg_cmt_disable();
	}
	return (cmt_lineage_status);
}
1894