/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/systm.h>
#include <sys/types.h>
#include <sys/param.h>
#include <sys/thread.h>
#include <sys/cpuvar.h>
#include <sys/cpupart.h>
#include <sys/kmem.h>
#include <sys/cmn_err.h>
#include <sys/kstat.h>
#include <sys/processor.h>
#include <sys/disp.h>
#include <sys/group.h>
#include <sys/pghw.h>
#include <sys/bitset.h>
#include <sys/lgrp.h>
#include <sys/cmt.h>
#include <sys/cpu_pm.h>

/*
 * CMT scheduler / dispatcher support
 *
 * This file implements CMT scheduler support using Processor Groups.
 * The CMT processor group class creates and maintains the CMT class
 * specific processor group pg_cmt_t.
 *
 * ---------------------------- <-- pg_cmt_t *
 * | pghw_t                   |
 * ----------------------------
 * | CMT class specific data  |
 * | - hierarchy linkage      |
 * | - CMT load balancing data|
 * | - active CPU group/bitset|
 * ----------------------------
 *
 * The scheduler/dispatcher leverages knowledge of the performance
 * relevant CMT sharing relationships existing between CPUs to implement
 * optimized affinity, load balancing, and coalescence policies.
 *
 * Load balancing policy seeks to improve performance by minimizing
 * contention over shared processor resources / facilities. Affinity
 * policies seek to improve cache and TLB utilization. Coalescence
 * policies improve resource utilization and ultimately power efficiency.
 *
 * The CMT PGs created by this class are already arranged into a
 * hierarchy (which is done in the pghw layer). To implement the top-down
 * CMT load balancing algorithm, the CMT PGs additionally maintain
 * parent, child and sibling hierarchy relationships.
 * Parent PGs always contain a superset of their children's resources,
 * each PG can have at most one parent, and siblings are the group of PGs
 * sharing the same parent.
 *
 * On NUMA systems, the CMT load balancing algorithm balances across the
 * CMT PGs within their respective lgroups. On UMA based systems, there
 * exists a single top level group of PGs to balance across. On NUMA systems
 * multiple top level groups are instantiated, where the top level balancing
 * begins by balancing across the CMT PGs within their respective (per lgroup)
 * top level groups.
 */
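/*
 * As an illustrative (hypothetical) example, a CPU on a two-core chip
 * with a shared last-level cache and a per-core execution pipeline might
 * end up with a lineage like the following, sorted by increasing CPU count:
 *
 *	cpu_cmt_hier[0]: PGHW_IPIPE PG (1 CPU,  this core's pipeline)
 *	cpu_cmt_hier[1]: PGHW_CACHE PG (2 CPUs, the shared cache)
 *	cpu_cmt_hier[2]: PGHW_CHIP  PG (2 CPUs, the socket)
 *
 * Each entry's CPUs are a subset of the entry above it; this is the
 * "subset invariant" that pg_cmt_lineage_validate() enforces below.
 */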
static cmt_lgrp_t	*cmt_lgrps = NULL;	/* cmt_lgrps list head */
static cmt_lgrp_t	*cpu0_lgrp = NULL;	/* boot CPU's initial lgrp */
						/* used for null_proc_lpa */
cmt_lgrp_t		*cmt_root = NULL;	/* Reference to root cmt pg */

static int		is_cpu0 = 1; /* true if this is boot CPU context */

/*
 * Array of hardware sharing relationships that are blacklisted.
 * PGs won't be instantiated for blacklisted hardware sharing relationships.
 */
static int		cmt_hw_blacklisted[PGHW_NUM_COMPONENTS];

/*
 * Set this to non-zero to disable CMT scheduling
 * This must be done via kmdb -d, as /etc/system will be too late
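 * (for example, from the kmdb prompt: "cmt_sched_disabled/W 1", then ":c")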
 */
int			cmt_sched_disabled = 0;

/*
 * Status codes for CMT lineage validation
 * See pg_cmt_lineage_validate() below
 */
typedef enum cmt_lineage_validation {
	CMT_LINEAGE_VALID,
	CMT_LINEAGE_NON_CONCENTRIC,
	CMT_LINEAGE_PG_SPANS_LGRPS,
	CMT_LINEAGE_NON_PROMOTABLE,
	CMT_LINEAGE_REPAIRED,
	CMT_LINEAGE_UNRECOVERABLE
} cmt_lineage_validation_t;

/*
 * Status of the current lineage under construction.
 * One must be holding cpu_lock to change this.
 */
cmt_lineage_validation_t	cmt_lineage_status = CMT_LINEAGE_VALID;

/*
 * Power domain definitions (on x86) are defined by ACPI, and
 * therefore may be subject to BIOS bugs.
 */
#define	PG_CMT_HW_SUSPECT(hw)	PGHW_IS_PM_DOMAIN(hw)

/*
 * Macro to test if PG is managed by the CMT PG class
 */
#define	IS_CMT_PG(pg)	(((pg_t *)(pg))->pg_class->pgc_id == pg_cmt_class_id)

static pg_cid_t		pg_cmt_class_id;		/* PG class id */

static pg_t		*pg_cmt_alloc();
static void		pg_cmt_free(pg_t *);
static void		pg_cmt_cpu_init(cpu_t *, cpu_pg_t *);
static void		pg_cmt_cpu_fini(cpu_t *, cpu_pg_t *);
static void		pg_cmt_cpu_active(cpu_t *);
static void		pg_cmt_cpu_inactive(cpu_t *);
static void		pg_cmt_cpupart_in(cpu_t *, cpupart_t *);
static void		pg_cmt_cpupart_move(cpu_t *, cpupart_t *, cpupart_t *);
static char		*pg_cmt_policy_name(pg_t *);
static void		pg_cmt_hier_sort(pg_cmt_t **, int);
static pg_cmt_t		*pg_cmt_hier_rank(pg_cmt_t *, pg_cmt_t *);
static int		pg_cmt_cpu_belongs(pg_t *, cpu_t *);
static int		pg_cmt_hw(pghw_type_t);
static cmt_lgrp_t	*pg_cmt_find_lgrp(lgrp_handle_t);
static cmt_lgrp_t	*pg_cmt_lgrp_create(lgrp_handle_t);
static void		cmt_ev_thread_swtch(pg_t *, cpu_t *, hrtime_t,
			    kthread_t *, kthread_t *);
static void		cmt_ev_thread_swtch_pwr(pg_t *, cpu_t *, hrtime_t,
			    kthread_t *, kthread_t *);
static void		cmt_ev_thread_remain_pwr(pg_t *, cpu_t *, kthread_t *);
static cmt_lineage_validation_t	pg_cmt_lineage_validate(pg_cmt_t **, int *,
			    cpu_pg_t *);


/*
 * CMT PG ops
 */
struct pg_ops pg_ops_cmt = {
	pg_cmt_alloc,
	pg_cmt_free,
	pg_cmt_cpu_init,
	pg_cmt_cpu_fini,
	pg_cmt_cpu_active,
	pg_cmt_cpu_inactive,
	pg_cmt_cpupart_in,
	NULL,			/* cpupart_out */
	pg_cmt_cpupart_move,
	pg_cmt_cpu_belongs,
	pg_cmt_policy_name,
};

/*
 * Initialize the CMT PG class
 */
void
pg_cmt_class_init(void)
{
	if (cmt_sched_disabled)
		return;

	pg_cmt_class_id = pg_class_register("cmt", &pg_ops_cmt, PGR_PHYSICAL);
}

/*
 * Called to indicate a new CPU has started up so
 * that either t0 or the slave startup thread can
 * be accounted for.
 */
void
pg_cmt_cpu_startup(cpu_t *cp)
{
	pg_ev_thread_swtch(cp, gethrtime_unscaled(), cp->cpu_idle_thread,
	    cp->cpu_thread);
}

/*
 * Return non-zero if thread can migrate between "from" and "to"
 * without a performance penalty
 */
int
pg_cmt_can_migrate(cpu_t *from, cpu_t *to)
{
	if (from->cpu_physid->cpu_cacheid ==
	    to->cpu_physid->cpu_cacheid)
		return (1);
	return (0);
}

/*
 * CMT class specific PG allocation
 */
static pg_t *
pg_cmt_alloc(void)
{
	return (kmem_zalloc(sizeof (pg_cmt_t), KM_NOSLEEP));
}

/*
 * Class specific PG de-allocation
 */
static void
pg_cmt_free(pg_t *pg)
{
	ASSERT(pg != NULL);
	ASSERT(IS_CMT_PG(pg));

	kmem_free((pg_cmt_t *)pg, sizeof (pg_cmt_t));
}

/*
 * Given a hardware sharing relationship, return which dispatcher
 * policies should be implemented to optimize performance and efficiency
 */
static pg_cmt_policy_t
pg_cmt_policy(pghw_type_t hw)
{
	pg_cmt_policy_t p;

	/*
	 * Give the platform a chance to override the default
	 */
	if ((p = pg_plat_cmt_policy(hw)) != CMT_NO_POLICY)
		return (p);

	switch (hw) {
	case PGHW_IPIPE:
	case PGHW_FPU:
	case PGHW_CHIP:
		return (CMT_BALANCE);
	case PGHW_CACHE:
		return (CMT_AFFINITY);
	case PGHW_POW_ACTIVE:
	case PGHW_POW_IDLE:
		return (CMT_BALANCE);
	default:
		return (CMT_NO_POLICY);
	}
}

/*
 * Rank the importance of optimizing for the pg1 relationship vs.
 * the pg2 relationship.
 */
static pg_cmt_t *
pg_cmt_hier_rank(pg_cmt_t *pg1, pg_cmt_t *pg2)
{
	pghw_type_t hw1 = ((pghw_t *)pg1)->pghw_hw;
	pghw_type_t hw2 = ((pghw_t *)pg2)->pghw_hw;

	/*
	 * A power domain is only important if CPUPM is enabled.
	 */
	if (cpupm_get_policy() == CPUPM_POLICY_DISABLED) {
		if (PGHW_IS_PM_DOMAIN(hw1) && !PGHW_IS_PM_DOMAIN(hw2))
			return (pg2);
		if (PGHW_IS_PM_DOMAIN(hw2) && !PGHW_IS_PM_DOMAIN(hw1))
			return (pg1);
	}

	/*
	 * Otherwise, ask the platform
	 */
	if (pg_plat_hw_rank(hw1, hw2) == hw1)
		return (pg1);
	else
		return (pg2);
}

/*
 * Initialize CMT callbacks for the given PG
 */
static void
cmt_callback_init(pg_t *pg)
{
	switch (((pghw_t *)pg)->pghw_hw) {
	case PGHW_POW_ACTIVE:
		pg->pg_cb.thread_swtch = cmt_ev_thread_swtch_pwr;
		pg->pg_cb.thread_remain = cmt_ev_thread_remain_pwr;
		break;
	default:
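		/*
		 * Non-power PGs keep the default thread_remain callback
		 * and only track utilization on thread switch.
		 */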
		pg->pg_cb.thread_swtch = cmt_ev_thread_swtch;
	}
}

/*
 * Promote PG above its current parent.
 * This is only legal if PG has an equal or greater number of CPUs than its
 * parent.
 *
 * This routine operates on the CPU specific processor group data (for the CPUs
 * in the PG being promoted), and may be invoked from a context where one CPU's
 * PG data is under construction. In this case the argument "pgdata", if not
 * NULL, is a reference to the CPU's under-construction PG data.
 */
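/*
 * Sketch of the swap performed below (hypothetical single-child case):
 *
 *	before:		parent		after:		pg
 *			   |				 |
 *			  pg				parent
 *
 * PG and its parent exchange their parent, children, and sibling
 * linkage, and each affected CPU's cached lineage is updated to match.
 */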
static void
cmt_hier_promote(pg_cmt_t *pg, cpu_pg_t *pgdata)
{
	pg_cmt_t	*parent;
	group_t		*children;
	cpu_t		*cpu;
	group_iter_t	iter;
	pg_cpu_itr_t	cpu_iter;
	int		r;
	int		err;

	ASSERT(MUTEX_HELD(&cpu_lock));

	parent = pg->cmt_parent;
	if (parent == NULL) {
		/*
		 * Nothing to do
		 */
		return;
	}

	ASSERT(PG_NUM_CPUS((pg_t *)pg) >= PG_NUM_CPUS((pg_t *)parent));

	/*
	 * We're changing around the hierarchy, which is actively traversed
	 * by the dispatcher. Pause CPUs to ensure exclusivity.
	 */
	pause_cpus(NULL);

	/*
	 * If necessary, update the parent's sibling set, replacing parent
	 * with PG.
	 */
	if (parent->cmt_siblings) {
		if (group_remove(parent->cmt_siblings, parent, GRP_NORESIZE)
		    != -1) {
			r = group_add(parent->cmt_siblings, pg, GRP_NORESIZE);
			ASSERT(r != -1);
		}
	}

	/*
	 * If the parent is at the top of the hierarchy, replace its entry
	 * in the root lgroup's group of top level PGs.
	 */
	if (parent->cmt_parent == NULL &&
	    parent->cmt_siblings != &cmt_root->cl_pgs) {
		if (group_remove(&cmt_root->cl_pgs, parent, GRP_NORESIZE)
		    != -1) {
			r = group_add(&cmt_root->cl_pgs, pg, GRP_NORESIZE);
			ASSERT(r != -1);
		}
	}

	/*
	 * We assume (and therefore assert) that the PG being promoted is an
	 * only child of its parent. Update the parent's children set,
	 * replacing PG's entry with the parent (since the parent is becoming
	 * the child). Then have PG and the parent swap children sets.
	 */
	ASSERT(GROUP_SIZE(parent->cmt_children) <= 1);
	if (group_remove(parent->cmt_children, pg, GRP_NORESIZE) != -1) {
		r = group_add(parent->cmt_children, parent, GRP_NORESIZE);
		ASSERT(r != -1);
	}

	children = pg->cmt_children;
	pg->cmt_children = parent->cmt_children;
	parent->cmt_children = children;

	/*
	 * Update the sibling references for PG and its parent
	 */
	pg->cmt_siblings = parent->cmt_siblings;
	parent->cmt_siblings = pg->cmt_children;

	/*
	 * Update any cached lineages in the per CPU pg data.
	 */
	PG_CPU_ITR_INIT(pg, cpu_iter);
	while ((cpu = pg_cpu_next(&cpu_iter)) != NULL) {
		int		idx;
		pg_cmt_t	*cpu_pg;
		cpu_pg_t	*pgd;	/* CPU's PG data */

		/*
		 * A CPU whose lineage is under construction still
		 * references the bootstrap CPU PG data structure.
		 */
		if (pg_cpu_is_bootstrapped(cpu))
			pgd = pgdata;
		else
			pgd = cpu->cpu_pg;

		/*
		 * Iterate over the CPU's PGs updating the children
		 * of the PG being promoted, since they have a new parent.
		 */
		group_iter_init(&iter);
		while ((cpu_pg = group_iterate(&pgd->cmt_pgs, &iter)) != NULL) {
			if (cpu_pg->cmt_parent == pg) {
				cpu_pg->cmt_parent = parent;
			}
		}

		/*
		 * Update the CMT load balancing lineage
		 */
		if ((idx = group_find(&pgd->cmt_pgs, (void *)pg)) == -1) {
			/*
			 * Unless this is the CPU whose lineage is being
			 * constructed, the PG being promoted should be
			 * in the lineage.
			 */
			ASSERT(pg_cpu_is_bootstrapped(cpu));
			continue;
		}

		ASSERT(idx > 0);
		ASSERT(GROUP_ACCESS(&pgd->cmt_pgs, idx - 1) == parent);

		/*
		 * Have the child and the parent swap places in the CPU's
		 * lineage
		 */
		group_remove_at(&pgd->cmt_pgs, idx);
		group_remove_at(&pgd->cmt_pgs, idx - 1);
		err = group_add_at(&pgd->cmt_pgs, parent, idx);
		ASSERT(err == 0);
		err = group_add_at(&pgd->cmt_pgs, pg, idx - 1);
		ASSERT(err == 0);
	}

	/*
	 * Update the parent references for PG and its parent
	 */
	pg->cmt_parent = parent->cmt_parent;
	parent->cmt_parent = pg;

	start_cpus();
}

/*
 * CMT class callback for a new CPU entering the system
 *
 * This routine operates on the CPU specific processor group data (for the CPU
 * being initialized). The argument "pgdata" is a reference to the CPU's PG
 * data to be constructed.
 *
 * cp->cpu_pg, which the dispatcher uses to access the CPU's PG data,
 * references a "bootstrap" structure. pg_cmt_cpu_init() and the routines it
 * calls must be careful to operate only on the "pgdata" argument, and not
 * cp->cpu_pg.
 */
static void
pg_cmt_cpu_init(cpu_t *cp, cpu_pg_t *pgdata)
{
	pg_cmt_t	*pg;
	group_t		*cmt_pgs;
	int		levels, level;
	pghw_type_t	hw;
	pg_t		*pg_cache = NULL;
	pg_cmt_t	*cpu_cmt_hier[PGHW_NUM_COMPONENTS];
	lgrp_handle_t	lgrp_handle;
	cmt_lgrp_t	*lgrp;
	cmt_lineage_validation_t	lineage_status;

	ASSERT(MUTEX_HELD(&cpu_lock));
	ASSERT(pg_cpu_is_bootstrapped(cp));

	if (cmt_sched_disabled)
		return;

	/*
	 * A new CPU is coming into the system.
	 * Interrogate the platform to see if the CPU
	 * has any performance or efficiency relevant
	 * sharing relationships
	 */
	cmt_pgs = &pgdata->cmt_pgs;
	pgdata->cmt_lineage = NULL;

	bzero(cpu_cmt_hier, sizeof (cpu_cmt_hier));
	levels = 0;
	for (hw = PGHW_START; hw < PGHW_NUM_COMPONENTS; hw++) {

		pg_cmt_policy_t	policy;

		/*
		 * We're only interested in the hw sharing relationships
		 * for which we know how to optimize.
		 */
		policy = pg_cmt_policy(hw);
		if (policy == CMT_NO_POLICY ||
		    pg_plat_hw_shared(cp, hw) == 0)
			continue;

		/*
		 * Continue if the hardware sharing relationship has been
		 * blacklisted.
		 */
		if (cmt_hw_blacklisted[hw]) {
			continue;
		}

		/*
		 * Find (or create) the PG associated with
		 * the hw sharing relationship in which cp
		 * belongs.
		 *
		 * Determine if a suitable PG already
		 * exists, or if one needs to be created.
		 */
		pg = (pg_cmt_t *)pghw_place_cpu(cp, hw);
		if (pg == NULL) {
			/*
			 * Create a new one.
			 * Initialize the common...
			 */
			pg = (pg_cmt_t *)pg_create(pg_cmt_class_id);

			/* ... physical ... */
			pghw_init((pghw_t *)pg, cp, hw);

			/*
			 * ... and CMT specific portions of the
			 * structure.
			 */
			pg->cmt_policy = policy;

			/* CMT event callbacks */
			cmt_callback_init((pg_t *)pg);

			bitset_init(&pg->cmt_cpus_actv_set);
			group_create(&pg->cmt_cpus_actv);
		} else {
			ASSERT(IS_CMT_PG(pg));
		}

		/* Add the CPU to the PG */
		pg_cpu_add((pg_t *)pg, cp, pgdata);

		/*
		 * Ensure capacity of the active CPU group/bitset
		 */
		group_expand(&pg->cmt_cpus_actv,
		    GROUP_SIZE(&((pg_t *)pg)->pg_cpus));

		if (cp->cpu_seqid >=
		    bitset_capacity(&pg->cmt_cpus_actv_set)) {
			bitset_resize(&pg->cmt_cpus_actv_set,
			    cp->cpu_seqid + 1);
		}

		/*
		 * Build a lineage of CMT PGs for load balancing / coalescence
		 */
		if (policy & (CMT_BALANCE | CMT_COALESCE)) {
			cpu_cmt_hier[levels++] = pg;
		}

		/* Cache this for later */
		if (hw == PGHW_CACHE)
			pg_cache = (pg_t *)pg;
	}

	group_expand(cmt_pgs, levels);

	if (cmt_root == NULL)
		cmt_root = pg_cmt_lgrp_create(lgrp_plat_root_hand());

	/*
	 * Find the lgrp that encapsulates this CPU's CMT hierarchy
	 */
	lgrp_handle = lgrp_plat_cpu_to_hand(cp->cpu_id);
	if ((lgrp = pg_cmt_find_lgrp(lgrp_handle)) == NULL)
		lgrp = pg_cmt_lgrp_create(lgrp_handle);

	/*
	 * Sort the PGs in the lineage by number of CPUs, ascending
	 */
	pg_cmt_hier_sort(cpu_cmt_hier, levels);

	/*
	 * Examine the lineage and validate it.
	 * This routine will also try to fix the lineage along with the
	 * rest of the PG hierarchy should it detect an issue.
	 *
	 * If it returns anything other than VALID or REPAIRED, an
	 * unrecoverable error has occurred, and we cannot proceed.
	 */
	lineage_status = pg_cmt_lineage_validate(cpu_cmt_hier, &levels, pgdata);
	if ((lineage_status != CMT_LINEAGE_VALID) &&
	    (lineage_status != CMT_LINEAGE_REPAIRED)) {
		/*
		 * In the case of an unrecoverable error where CMT scheduling
		 * has been disabled, assert that the under construction CPU's
		 * PG data has an empty CMT load balancing lineage.
		 */
		ASSERT((cmt_sched_disabled == 0) ||
		    (GROUP_SIZE(&(pgdata->cmt_pgs)) == 0));
		return;
	}

	/*
	 * For existing PGs in the lineage, verify that the parent is
	 * correct, as the generation in the lineage may have changed
	 * as a result of the sorting. Start the traversal at the top
	 * of the lineage, moving down.
	 */
	for (level = levels - 1; level >= 0; ) {
		int reorg;

		reorg = 0;
		pg = cpu_cmt_hier[level];

		/*
		 * Promote PGs at an incorrect generation into place.
		 */
		while (pg->cmt_parent &&
		    pg->cmt_parent != cpu_cmt_hier[level + 1]) {
			cmt_hier_promote(pg, pgdata);
			reorg++;
		}
		if (reorg > 0)
			level = levels - 1;
		else
			level--;
	}

	/*
	 * For each of the PGs in the CPU's lineage:
	 *	- Add an entry in the CPU sorted CMT PG group
	 *	  which is used for top down CMT load balancing
	 *	- Tie the PG into the CMT hierarchy by connecting
	 *	  it to its parent and siblings.
	 */
	for (level = 0; level < levels; level++) {
		uint_t		children;
		int		err;

		pg = cpu_cmt_hier[level];
		err = group_add_at(cmt_pgs, pg, levels - level - 1);
		ASSERT(err == 0);

		if (level == 0)
			pgdata->cmt_lineage = (pg_t *)pg;

		if (pg->cmt_siblings != NULL) {
			/* Already initialized */
			ASSERT(pg->cmt_parent == NULL ||
			    pg->cmt_parent == cpu_cmt_hier[level + 1]);
			ASSERT(pg->cmt_siblings == &lgrp->cl_pgs ||
			    ((pg->cmt_parent != NULL) &&
			    pg->cmt_siblings == pg->cmt_parent->cmt_children));
			continue;
		}

		if ((level + 1) == levels) {
			pg->cmt_parent = NULL;

			pg->cmt_siblings = &lgrp->cl_pgs;
			children = ++lgrp->cl_npgs;
			if (cmt_root != lgrp)
				cmt_root->cl_npgs++;
		} else {
			pg->cmt_parent = cpu_cmt_hier[level + 1];

			/*
			 * A good parent keeps track of their children.
			 * The parent's children group is also the PG's
			 * siblings.
			 */
			if (pg->cmt_parent->cmt_children == NULL) {
				pg->cmt_parent->cmt_children =
				    kmem_zalloc(sizeof (group_t), KM_SLEEP);
				group_create(pg->cmt_parent->cmt_children);
			}
			pg->cmt_siblings = pg->cmt_parent->cmt_children;
			children = ++pg->cmt_parent->cmt_nchildren;
		}

		group_expand(pg->cmt_siblings, children);
		group_expand(&cmt_root->cl_pgs, cmt_root->cl_npgs);
	}

	/*
	 * Cache the chip and core IDs in the cpu_t->cpu_physid structure
	 * for fast lookups later.
	 */
	if (cp->cpu_physid) {
		cp->cpu_physid->cpu_chipid =
		    pg_plat_hw_instance_id(cp, PGHW_CHIP);
		cp->cpu_physid->cpu_coreid = pg_plat_get_core_id(cp);

		/*
		 * If this CPU has a PG representing shared cache, then set
		 * cpu_cacheid to that PG's logical id
		 */
		if (pg_cache)
			cp->cpu_physid->cpu_cacheid = pg_cache->pg_id;
	}

	/* CPU0 only initialization */
	if (is_cpu0) {
		pg_cmt_cpu_startup(cp);
		is_cpu0 = 0;
		cpu0_lgrp = lgrp;
	}
}

/*
 * Class callback when a CPU is leaving the system (deletion)
 *
 * "pgdata" is a reference to the CPU's PG data to be deconstructed.
 *
 * cp->cpu_pg, which the dispatcher uses to access the CPU's PG data,
 * references a "bootstrap" structure across this function's invocation.
 * pg_cmt_cpu_fini() and the routines it calls must be careful to operate only
 * on the "pgdata" argument, and not cp->cpu_pg.
 */
static void
pg_cmt_cpu_fini(cpu_t *cp, cpu_pg_t *pgdata)
{
	group_iter_t	i;
	pg_cmt_t	*pg;
	group_t		*pgs, *cmt_pgs;
	lgrp_handle_t	lgrp_handle;
	cmt_lgrp_t	*lgrp;

	if (cmt_sched_disabled)
		return;

	ASSERT(pg_cpu_is_bootstrapped(cp));

	pgs = &pgdata->pgs;
	cmt_pgs = &pgdata->cmt_pgs;

	/*
	 * Find the lgroup that encapsulates this CPU's CMT hierarchy
	 */
	lgrp_handle = lgrp_plat_cpu_to_hand(cp->cpu_id);

	lgrp = pg_cmt_find_lgrp(lgrp_handle);
	if (ncpus == 1 && lgrp != cpu0_lgrp) {
		/*
		 * One might wonder how we could be deconfiguring the
		 * only CPU in the system.
		 *
		 * On Starcat systems when null_proc_lpa is detected,
		 * the boot CPU (which is already configured into a leaf
		 * lgroup) is moved into the root lgroup. This is done by
		 * deconfiguring it from both lgroups and processor
		 * groups, and then later reconfiguring it back in.  This
		 * call to pg_cmt_cpu_fini() is part of that deconfiguration.
		 *
		 * This special case is detected by noting that the platform
		 * has changed the CPU's lgrp affiliation (since it now
		 * belongs in the root). In this case, use the cmt_lgrp_t
		 * cached for the boot CPU, since this is what needs to be
		 * torn down.
		 */
		lgrp = cpu0_lgrp;
	}

	ASSERT(lgrp != NULL);

	/*
	 * First, clean up anything load balancing specific for each of
	 * the CPU's PGs that participated in CMT load balancing
	 */
	pg = (pg_cmt_t *)pgdata->cmt_lineage;
	while (pg != NULL) {

		/*
		 * Remove the PG from the CPU's load balancing lineage
		 */
		(void) group_remove(cmt_pgs, pg, GRP_RESIZE);

		/*
		 * If it's about to become empty, destroy its children
		 * group, and remove its reference from its siblings.
		 * This is done here (rather than below) to avoid removing
		 * our reference from a PG that we just eliminated.
		 */
		if (GROUP_SIZE(&((pg_t *)pg)->pg_cpus) == 1) {
			if (pg->cmt_children != NULL)
				group_destroy(pg->cmt_children);
			if (pg->cmt_siblings != NULL) {
				if (pg->cmt_siblings == &lgrp->cl_pgs)
					lgrp->cl_npgs--;
				else
					pg->cmt_parent->cmt_nchildren--;
			}
		}
		pg = pg->cmt_parent;
	}
	ASSERT(GROUP_SIZE(cmt_pgs) == 0);

	/*
	 * Now that the load balancing lineage updates have happened,
	 * remove the CPU from all its PGs (destroying any that become
	 * empty).
	 */
	group_iter_init(&i);
	while ((pg = group_iterate(pgs, &i)) != NULL) {
		if (IS_CMT_PG(pg) == 0)
			continue;

		pg_cpu_delete((pg_t *)pg, cp, pgdata);
		/*
		 * Deleting the CPU from the PG changes the CPU's
		 * PG group over which we are actively iterating.
		 * Re-initialize the iteration.
		 */
		group_iter_init(&i);

		if (GROUP_SIZE(&((pg_t *)pg)->pg_cpus) == 0) {

			/*
			 * The PG has become zero sized, so destroy it.
			 */
			group_destroy(&pg->cmt_cpus_actv);
			bitset_fini(&pg->cmt_cpus_actv_set);
			pghw_fini((pghw_t *)pg);

			pg_destroy((pg_t *)pg);
		}
	}
}

/*
 * Class callback when a CPU is entering a cpu partition
 */
static void
pg_cmt_cpupart_in(cpu_t *cp, cpupart_t *pp)
{
	group_t		*pgs;
	pg_t		*pg;
	group_iter_t	i;

	ASSERT(MUTEX_HELD(&cpu_lock));

	if (cmt_sched_disabled)
		return;

	pgs = &cp->cpu_pg->pgs;

	/*
	 * Ensure that the new partition's PG bitset
	 * is large enough for all CMT PGs to which cp
	 * belongs
	 */
	group_iter_init(&i);
	while ((pg = group_iterate(pgs, &i)) != NULL) {
		if (IS_CMT_PG(pg) == 0)
			continue;

		if (bitset_capacity(&pp->cp_cmt_pgs) <= pg->pg_id)
			bitset_resize(&pp->cp_cmt_pgs, pg->pg_id + 1);
	}
}

/*
 * Class callback when a CPU is actually moving partitions
 */
static void
pg_cmt_cpupart_move(cpu_t *cp, cpupart_t *oldpp, cpupart_t *newpp)
{
	cpu_t		*cpp;
	group_t		*pgs;
	pg_t		*pg;
	group_iter_t	pg_iter;
	pg_cpu_itr_t	cpu_iter;
	boolean_t	found;

	ASSERT(MUTEX_HELD(&cpu_lock));

	if (cmt_sched_disabled)
		return;

	pgs = &cp->cpu_pg->pgs;
	group_iter_init(&pg_iter);

	/*
	 * Iterate over the CPU's CMT PGs
	 */
	while ((pg = group_iterate(pgs, &pg_iter)) != NULL) {

		if (IS_CMT_PG(pg) == 0)
			continue;

		/*
		 * Add the PG to the bitset in the new partition.
		 */
		bitset_add(&newpp->cp_cmt_pgs, pg->pg_id);

		/*
		 * Remove the PG from the bitset in the old partition
		 * if the last of the PG's CPUs have left.
		 */
		found = B_FALSE;
		PG_CPU_ITR_INIT(pg, cpu_iter);
		while ((cpp = pg_cpu_next(&cpu_iter)) != NULL) {
			if (cpp == cp)
				continue;
			if (CPU_ACTIVE(cpp) &&
			    cpp->cpu_part->cp_id == oldpp->cp_id) {
				found = B_TRUE;
				break;
			}
		}
		if (!found)
			bitset_del(&cp->cpu_part->cp_cmt_pgs, pg->pg_id);
	}
}

/*
 * Class callback when a CPU becomes active (online)
 *
 * This is called in a context where CPUs are paused
 */
static void
pg_cmt_cpu_active(cpu_t *cp)
{
	int		err;
	group_iter_t	i;
	pg_cmt_t	*pg;
	group_t		*pgs;

	ASSERT(MUTEX_HELD(&cpu_lock));

	if (cmt_sched_disabled)
		return;

	pgs = &cp->cpu_pg->pgs;
	group_iter_init(&i);

	/*
	 * Iterate over the CPU's PGs
	 */
	while ((pg = group_iterate(pgs, &i)) != NULL) {

		if (IS_CMT_PG(pg) == 0)
			continue;

		err = group_add(&pg->cmt_cpus_actv, cp, GRP_NORESIZE);
		ASSERT(err == 0);

		/*
		 * If this is the first active CPU in the PG, and it
		 * represents a hardware sharing relationship over which
		 * CMT load balancing is performed, add it as a candidate
		 * for balancing with its siblings.
		 */
		if (GROUP_SIZE(&pg->cmt_cpus_actv) == 1 &&
		    (pg->cmt_policy & (CMT_BALANCE | CMT_COALESCE))) {
			err = group_add(pg->cmt_siblings, pg, GRP_NORESIZE);
			ASSERT(err == 0);

			/*
			 * If this is a top level PG, add it as a balancing
			 * candidate when balancing within the root lgroup.
			 */
			if (pg->cmt_parent == NULL &&
			    pg->cmt_siblings != &cmt_root->cl_pgs) {
				err = group_add(&cmt_root->cl_pgs, pg,
				    GRP_NORESIZE);
				ASSERT(err == 0);
			}
		}

		/*
		 * Notate the CPU in the PG's active CPU bitset.
		 * Also notate the PG as being active in its associated
		 * partition
		 */
		bitset_add(&pg->cmt_cpus_actv_set, cp->cpu_seqid);
		bitset_add(&cp->cpu_part->cp_cmt_pgs, ((pg_t *)pg)->pg_id);
	}
}

/*
 * Class callback when a CPU goes inactive (offline)
 *
 * This is called in a context where CPUs are paused
 */
static void
pg_cmt_cpu_inactive(cpu_t *cp)
{
	int		err;
	group_t		*pgs;
	pg_cmt_t	*pg;
	cpu_t		*cpp;
	group_iter_t	i;
	pg_cpu_itr_t	cpu_itr;
	boolean_t	found;

	ASSERT(MUTEX_HELD(&cpu_lock));

	if (cmt_sched_disabled)
		return;

	pgs = &cp->cpu_pg->pgs;
	group_iter_init(&i);

	while ((pg = group_iterate(pgs, &i)) != NULL) {

		if (IS_CMT_PG(pg) == 0)
			continue;

		/*
		 * Remove the CPU from the CMT PG's active CPU group
		 * and bitset
		 */
		err = group_remove(&pg->cmt_cpus_actv, cp, GRP_NORESIZE);
		ASSERT(err == 0);

		bitset_del(&pg->cmt_cpus_actv_set, cp->cpu_seqid);

		/*
		 * If there are no more active CPUs in this PG over which
		 * load was balanced, remove it as a balancing candidate.
		 */
		if (GROUP_SIZE(&pg->cmt_cpus_actv) == 0 &&
		    (pg->cmt_policy & (CMT_BALANCE | CMT_COALESCE))) {
			err = group_remove(pg->cmt_siblings, pg, GRP_NORESIZE);
			ASSERT(err == 0);

			if (pg->cmt_parent == NULL &&
			    pg->cmt_siblings != &cmt_root->cl_pgs) {
				err = group_remove(&cmt_root->cl_pgs, pg,
				    GRP_NORESIZE);
				ASSERT(err == 0);
			}
		}

		/*
		 * Assert the number of active CPUs does not exceed
		 * the total number of CPUs in the PG
		 */
		ASSERT(GROUP_SIZE(&pg->cmt_cpus_actv) <=
		    GROUP_SIZE(&((pg_t *)pg)->pg_cpus));

		/*
		 * Update the PG bitset in the CPU's old partition
		 */
		found = B_FALSE;
		PG_CPU_ITR_INIT(pg, cpu_itr);
		while ((cpp = pg_cpu_next(&cpu_itr)) != NULL) {
			if (cpp == cp)
				continue;
			if (CPU_ACTIVE(cpp) &&
			    cpp->cpu_part->cp_id == cp->cpu_part->cp_id) {
				found = B_TRUE;
				break;
			}
		}
		if (!found) {
			bitset_del(&cp->cpu_part->cp_cmt_pgs,
			    ((pg_t *)pg)->pg_id);
		}
	}
}

/*
 * Return non-zero if the CPU belongs in the given PG
 */
static int
pg_cmt_cpu_belongs(pg_t *pg, cpu_t *cp)
{
	cpu_t	*pg_cpu;

	pg_cpu = GROUP_ACCESS(&pg->pg_cpus, 0);

	ASSERT(pg_cpu != NULL);

	/*
	 * The CPU belongs if, given the nature of the hardware sharing
	 * relationship represented by the PG, the CPU has that
	 * relationship with some other CPU already in the PG
	 */
	if (pg_plat_cpus_share(cp, pg_cpu, ((pghw_t *)pg)->pghw_hw))
		return (1);

	return (0);
}

/*
 * Sort the CPU's CMT hierarchy, where "size" is the number of levels.
 */
static void
pg_cmt_hier_sort(pg_cmt_t **hier, int size)
{
	int		i, j, inc;
	pg_t		*tmp;
	pg_t		**h = (pg_t **)hier;

	/*
	 * First sort by number of CPUs
	 */
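	/*
	 * This is a shell sort over the (small) lineage array: the gap
	 * sequence starts at size / 2 and shrinks by roughly 5/11 each
	 * pass, ending with a final gap of 1 (a plain insertion sort).
	 */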
	inc = size / 2;
	while (inc > 0) {
		for (i = inc; i < size; i++) {
			j = i;
			tmp = h[i];
			while ((j >= inc) &&
			    (PG_NUM_CPUS(h[j - inc]) > PG_NUM_CPUS(tmp))) {
				h[j] = h[j - inc];
				j = j - inc;
			}
			h[j] = tmp;
		}
		if (inc == 2)
			inc = 1;
		else
			inc = (inc * 5) / 11;
	}

	/*
	 * Break ties by asking the platform.
	 * Determine if h[i] outranks h[i + 1] and if so, swap them.
	 */
	for (i = 0; i < size - 1; i++) {
		if ((PG_NUM_CPUS(h[i]) == PG_NUM_CPUS(h[i + 1])) &&
		    pg_cmt_hier_rank(hier[i], hier[i + 1]) == hier[i]) {
			tmp = h[i];
			h[i] = h[i + 1];
			h[i + 1] = tmp;
		}
	}
}

/*
 * Return a cmt_lgrp_t * given an lgroup handle.
 */
static cmt_lgrp_t *
pg_cmt_find_lgrp(lgrp_handle_t hand)
{
	cmt_lgrp_t	*lgrp;

	ASSERT(MUTEX_HELD(&cpu_lock));

	lgrp = cmt_lgrps;
	while (lgrp != NULL) {
		if (lgrp->cl_hand == hand)
			break;
		lgrp = lgrp->cl_next;
	}
	return (lgrp);
}

/*
 * Create a cmt_lgrp_t with the specified handle.
 */
static cmt_lgrp_t *
pg_cmt_lgrp_create(lgrp_handle_t hand)
{
	cmt_lgrp_t	*lgrp;

	ASSERT(MUTEX_HELD(&cpu_lock));

	lgrp = kmem_zalloc(sizeof (cmt_lgrp_t), KM_SLEEP);

	lgrp->cl_hand = hand;
	lgrp->cl_npgs = 0;
	lgrp->cl_next = cmt_lgrps;
	cmt_lgrps = lgrp;
	group_create(&lgrp->cl_pgs);

	return (lgrp);
}

/*
 * Interfaces to enable and disable power aware dispatching
 * The caller must be holding cpu_lock.
 *
 * Return 0 on success and -1 on failure.
 */
int
cmt_pad_enable(pghw_type_t type)
{
	group_t		*hwset;
	group_iter_t	iter;
	pg_cmt_t	*pg;

	ASSERT(PGHW_IS_PM_DOMAIN(type));
	ASSERT(MUTEX_HELD(&cpu_lock));

	if ((hwset = pghw_set_lookup(type)) == NULL ||
	    cmt_hw_blacklisted[type]) {
		/*
		 * Unable to find any instances of the specified type
		 * of power domain, or the power domains have been blacklisted.
		 */
		return (-1);
	}

	/*
	 * Iterate over the power domains, setting the default dispatcher
	 * policy for power/performance optimization.
	 *
	 * Simply setting the policy isn't enough in the case where the power
	 * domain is an only child of another PG. Because the dispatcher walks
	 * the PG hierarchy in a top down fashion, the higher up PG's policy
	 * will dominate. So promote the power domain above its parent if both
	 * PG and its parent have the same CPUs to ensure its policy
	 * dominates.
	 */
	group_iter_init(&iter);
	while ((pg = group_iterate(hwset, &iter)) != NULL) {
		/*
		 * If the power domain is an only child to a parent
		 * not implementing the same policy, promote the child
		 * above the parent to activate the policy.
		 */
		pg->cmt_policy = pg_cmt_policy(((pghw_t *)pg)->pghw_hw);
		while ((pg->cmt_parent != NULL) &&
		    (pg->cmt_parent->cmt_policy != pg->cmt_policy) &&
		    (PG_NUM_CPUS((pg_t *)pg) ==
		    PG_NUM_CPUS((pg_t *)pg->cmt_parent))) {
			cmt_hier_promote(pg, NULL);
		}
	}

	return (0);
}

int
cmt_pad_disable(pghw_type_t type)
{
	group_t		*hwset;
	group_iter_t	iter;
	pg_cmt_t	*pg;
	pg_cmt_t	*child;

	ASSERT(PGHW_IS_PM_DOMAIN(type));
	ASSERT(MUTEX_HELD(&cpu_lock));

	if ((hwset = pghw_set_lookup(type)) == NULL) {
		/*
		 * Unable to find any instances of the specified type of
		 * power domain.
		 */
		return (-1);
	}
	/*
	 * Iterate over the power domains, setting the default dispatcher
	 * policy for performance optimization (load balancing).
	 */
	group_iter_init(&iter);
	while ((pg = group_iterate(hwset, &iter)) != NULL) {

		/*
		 * If the power domain has an only child that implements
		 * policy other than load balancing, promote the child
		 * above the power domain to ensure its policy dominates.
		 */
		if (pg->cmt_children != NULL &&
		    GROUP_SIZE(pg->cmt_children) == 1) {
			child = GROUP_ACCESS(pg->cmt_children, 0);
			if ((child->cmt_policy & CMT_BALANCE) == 0) {
				cmt_hier_promote(child, NULL);
			}
		}
		pg->cmt_policy = CMT_BALANCE;
	}
	return (0);
}

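/*
 * CMT class callback invoked on thread switch. cmt_utilization tracks
 * how many CPUs in the PG are running something other than their idle
 * thread: switching away from idle increments it, and switching to the
 * idle thread decrements it.
 */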
/* ARGSUSED */
static void
cmt_ev_thread_swtch(pg_t *pg, cpu_t *cp, hrtime_t now, kthread_t *old,
		    kthread_t *new)
{
	pg_cmt_t	*cmt_pg = (pg_cmt_t *)pg;

	if (old == cp->cpu_idle_thread) {
		atomic_add_32(&cmt_pg->cmt_utilization, 1);
	} else if (new == cp->cpu_idle_thread) {
		atomic_add_32(&cmt_pg->cmt_utilization, -1);
	}
}

/*
 * Macro to test whether a thread is currently runnable on a CPU in a PG.
 */
#define	THREAD_RUNNABLE_IN_PG(t, pg)					\
	((t)->t_state == TS_RUN &&					\
	    (t)->t_disp_queue->disp_cpu &&				\
	    bitset_in_set(&(pg)->cmt_cpus_actv_set,			\
	    (t)->t_disp_queue->disp_cpu->cpu_seqid))

static void
cmt_ev_thread_swtch_pwr(pg_t *pg, cpu_t *cp, hrtime_t now, kthread_t *old,
    kthread_t *new)
{
	pg_cmt_t	*cmt = (pg_cmt_t *)pg;
	cpupm_domain_t	*dom;
	uint32_t	u;

	if (old == cp->cpu_idle_thread) {
		ASSERT(new != cp->cpu_idle_thread);
		u = atomic_add_32_nv(&cmt->cmt_utilization, 1);
		if (u == 1) {
			/*
			 * Notify the CPU power manager that the domain
			 * is non-idle.
			 */
			dom = (cpupm_domain_t *)cmt->cmt_pg.pghw_handle;
			cpupm_utilization_event(cp, now, dom,
			    CPUPM_DOM_BUSY_FROM_IDLE);
		}
	} else if (new == cp->cpu_idle_thread) {
		ASSERT(old != cp->cpu_idle_thread);
		u = atomic_add_32_nv(&cmt->cmt_utilization, -1);
		if (u == 0) {
			/*
			 * The domain is idle, notify the CPU power
			 * manager.
			 *
			 * Avoid notifying if the thread is simply migrating
			 * between CPUs in the domain.
			 */
			if (!THREAD_RUNNABLE_IN_PG(old, cmt)) {
				dom = (cpupm_domain_t *)cmt->cmt_pg.pghw_handle;
				cpupm_utilization_event(cp, now, dom,
				    CPUPM_DOM_IDLE_FROM_BUSY);
			}
		}
	}
}

/* ARGSUSED */
static void
cmt_ev_thread_remain_pwr(pg_t *pg, cpu_t *cp, kthread_t *t)
{
	pg_cmt_t	*cmt = (pg_cmt_t *)pg;
	cpupm_domain_t	*dom;

	dom = (cpupm_domain_t *)cmt->cmt_pg.pghw_handle;
	cpupm_utilization_event(cp, (hrtime_t)0, dom, CPUPM_DOM_REMAIN_BUSY);
}

/*
 * Return the name of the CMT scheduling policy
 * being implemented across this PG
 */
static char *
pg_cmt_policy_name(pg_t *pg)
{
	pg_cmt_policy_t policy;

	policy = ((pg_cmt_t *)pg)->cmt_policy;

	if (policy & CMT_AFFINITY) {
		if (policy & CMT_BALANCE)
			return ("Load Balancing & Affinity");
		else if (policy & CMT_COALESCE)
			return ("Load Coalescence & Affinity");
		else
			return ("Affinity");
	} else {
		if (policy & CMT_BALANCE)
			return ("Load Balancing");
		else if (policy & CMT_COALESCE)
			return ("Load Coalescence");
		else
			return ("None");
	}
}

/*
 * Prune PG, and all other instances of PG's hardware sharing relationship
 * from the PG hierarchy.
 *
 * This routine operates on the CPU specific processor group data (for the CPUs
 * in the PG being pruned), and may be invoked from a context where one CPU's
 * PG data is under construction. In this case the argument "pgdata", if not
 * NULL, is a reference to the CPU's under-construction PG data.
 */
static int
pg_cmt_prune(pg_cmt_t *pg_bad, pg_cmt_t **lineage, int *sz, cpu_pg_t *pgdata)
{
	group_t		*hwset, *children;
	int		i, j, r, size = *sz;
	group_iter_t	hw_iter, child_iter;
	pg_cpu_itr_t	cpu_iter;
	pg_cmt_t	*pg, *child;
	cpu_t		*cpu;
	int		cap_needed;
	pghw_type_t	hw;

	ASSERT(MUTEX_HELD(&cpu_lock));

	hw = ((pghw_t *)pg_bad)->pghw_hw;

	if (hw == PGHW_POW_ACTIVE) {
		cmn_err(CE_NOTE, "!Active CPUPM domain groups look suspect. "
		    "Event Based CPUPM Unavailable");
	} else if (hw == PGHW_POW_IDLE) {
		cmn_err(CE_NOTE, "!Idle CPUPM domain groups look suspect. "
		    "Dispatcher assisted CPUPM disabled.");
	}

	/*
	 * Find and eliminate the PG from the lineage.
	 */
	for (i = 0; i < size; i++) {
		if (lineage[i] == pg_bad) {
			for (j = i; j < size - 1; j++)
				lineage[j] = lineage[j + 1];
			*sz = size - 1;
			break;
		}
	}

	/*
	 * We'll prune all instances of the hardware sharing relationship
	 * represented by pg. But before we do that (and pause CPUs) we need
	 * to ensure the hierarchy's groups are properly sized.
	 */
	hwset = pghw_set_lookup(hw);

	/*
	 * Blacklist the hardware so that future groups won't be created.
	 */
	cmt_hw_blacklisted[hw] = 1;

	/*
	 * For each of the PGs being pruned, ensure sufficient capacity in
	 * the siblings set for the PG's children
	 */
	group_iter_init(&hw_iter);
	while ((pg = group_iterate(hwset, &hw_iter)) != NULL) {
		/*
		 * PG is being pruned, but if it is bringing up more than
		 * one child, ask for more capacity in the siblings group.
		 */
		cap_needed = 0;
		if (pg->cmt_children &&
		    GROUP_SIZE(pg->cmt_children) > 1) {
			cap_needed = GROUP_SIZE(pg->cmt_children) - 1;

			group_expand(pg->cmt_siblings,
			    GROUP_SIZE(pg->cmt_siblings) + cap_needed);

			/*
			 * If this is a top level group, also ensure the
			 * capacity in the root lgrp level CMT grouping.
			 */
			if (pg->cmt_parent == NULL &&
			    pg->cmt_siblings != &cmt_root->cl_pgs) {
				group_expand(&cmt_root->cl_pgs,
				    GROUP_SIZE(&cmt_root->cl_pgs) + cap_needed);
			}
		}
	}

	/*
	 * We're operating on the PG hierarchy. Pause CPUs to ensure
	 * exclusivity with respect to the dispatcher.
	 */
	pause_cpus(NULL);

	/*
	 * Prune all PG instances of the hardware sharing relationship
	 * represented by pg.
	 */
	group_iter_init(&hw_iter);
	while ((pg = group_iterate(hwset, &hw_iter)) != NULL) {

		/*
		 * Remove PG from its group of siblings, if it's there.
		 */
		if (pg->cmt_siblings) {
			(void) group_remove(pg->cmt_siblings, pg, GRP_NORESIZE);
		}
		if (pg->cmt_parent == NULL &&
		    pg->cmt_siblings != &cmt_root->cl_pgs) {
			(void) group_remove(&cmt_root->cl_pgs, pg,
			    GRP_NORESIZE);
		}
		/*
		 * Move PG's children from its children set to its parent's
		 * children set. Note that the parent's children set, and PG's
		 * siblings set are the same thing.
		 *
		 * Because we are iterating over the same group that we are
		 * operating on (removing the children), first add all of PG's
		 * children to the parent's children set, and once we are done
		 * iterating, empty PG's children set.
		 */
		if (pg->cmt_children != NULL) {
			children = pg->cmt_children;

			group_iter_init(&child_iter);
			while ((child = group_iterate(children, &child_iter))
			    != NULL) {
				if (pg->cmt_siblings != NULL) {
					r = group_add(pg->cmt_siblings, child,
					    GRP_NORESIZE);
					ASSERT(r == 0);
				}
			}
			group_empty(pg->cmt_children);
		}

		/*
		 * Reset the callbacks to the defaults
		 */
		pg_callback_set_defaults((pg_t *)pg);

		/*
		 * Update all the CPU lineages in each of PG's CPUs
		 */
		PG_CPU_ITR_INIT(pg, cpu_iter);
		while ((cpu = pg_cpu_next(&cpu_iter)) != NULL) {
			pg_cmt_t	*cpu_pg;
			group_iter_t	liter;	/* Iterator for the lineage */
			cpu_pg_t	*cpd;	/* CPU's PG data */

			/*
			 * A CPU whose lineage is under construction still
			 * references the bootstrap CPU PG data structure.
			 */
			if (pg_cpu_is_bootstrapped(cpu))
				cpd = pgdata;
			else
				cpd = cpu->cpu_pg;

			/*
			 * Iterate over the CPU's PGs updating the children
			 * of the PG being pruned, since they have a new
			 * parent and siblings set.
			 */
			group_iter_init(&liter);
			while ((cpu_pg = group_iterate(&cpd->pgs,
			    &liter)) != NULL) {
				if (cpu_pg->cmt_parent == pg) {
					cpu_pg->cmt_parent = pg->cmt_parent;
					cpu_pg->cmt_siblings = pg->cmt_siblings;
				}
			}

			/*
			 * Update the CPU's lineages
			 */
			(void) group_remove(&cpd->pgs, pg, GRP_NORESIZE);
			(void) group_remove(&cpd->cmt_pgs, pg, GRP_NORESIZE);
		}
	}
	start_cpus();
	return (0);
}

/*
 * Disable CMT scheduling
 */
static void
pg_cmt_disable(void)
{
	cpu_t		*cpu;

	ASSERT(MUTEX_HELD(&cpu_lock));

	pause_cpus(NULL);
	cpu = cpu_list;

	do {
		if (cpu->cpu_pg)
			group_empty(&cpu->cpu_pg->cmt_pgs);
	} while ((cpu = cpu->cpu_next) != cpu_list);

	cmt_sched_disabled = 1;
	start_cpus();
	cmn_err(CE_NOTE, "!CMT thread placement optimizations unavailable");
}

/*
 * CMT lineage validation
 *
 * This routine is invoked by pg_cmt_cpu_init() to validate the integrity
 * of the PGs in a CPU's lineage. This is necessary because it's possible that
 * some groupings (power domain groupings in particular) may be defined by
 * sources that are buggy (e.g. BIOS bugs). In such cases, it may not be
 * possible to integrate those groupings into the CMT PG hierarchy, if doing
 * so would violate the subset invariant of the hierarchy, which says that
 * a PG must be a subset of its parent (if it has one).
 *
 * pg_cmt_lineage_validate()'s purpose is to detect grouping definitions that
 * would result in a violation of this invariant. If a violation is found,
 * and the PG is of a grouping type whose definition is known to originate from
 * suspect sources (BIOS), then pg_cmt_prune() will be invoked to prune the
 * PG (and all other instances of that PG's sharing relationship type) from the
 * hierarchy. Further, future instances of that sharing relationship type won't
 * be instantiated. If the grouping definition doesn't originate from suspect
 * sources, then pg_cmt_disable() will be invoked to log an error, and disable
 * CMT scheduling altogether.
 *
 * This routine is invoked after the CPU has been added to the PGs in which
 * it belongs, but before those PGs have been added to (or had their place
 * adjusted in) the CMT PG hierarchy.
 *
 * The first argument is the CPU's PG lineage (essentially an array of PGs in
 * which the CPU belongs) that has already been sorted in ascending order
 * by CPU count. Some of the PGs in the CPU's lineage may already have other
 * CPUs in them, and have already been integrated into the CMT hierarchy.
 *
 * The addition of this new CPU to these pre-existing PGs means that those
 * PGs may need to be promoted up in the hierarchy to satisfy the subset
 * invariant. In addition to testing the subset invariant for the lineage,
 * this routine also verifies that the addition of the new CPU to the
 * existing PGs wouldn't cause the subset invariant to be violated in
 * the existing lineages.
 *
 * This routine will normally return one of the following:
 * CMT_LINEAGE_VALID - There were no problems detected with the lineage.
 * CMT_LINEAGE_REPAIRED - Problems were detected, but repaired via pruning.
 *
 * Otherwise, this routine will return a value indicating which error it
 * was unable to recover from (and set cmt_lineage_status along the way).
 *
 *
 * This routine operates on the CPU specific processor group data (for the CPU
 * whose lineage is being validated), which is under-construction.
 * "pgdata" is a reference to the CPU's under-construction PG data.
 * This routine must be careful to operate only on "pgdata", and not cp->cpu_pg.
 */
static cmt_lineage_validation_t
pg_cmt_lineage_validate(pg_cmt_t **lineage, int *sz, cpu_pg_t *pgdata)
{
	int		i, j, size;
	pg_cmt_t	*pg, *pg_next, *pg_bad, *pg_tmp;
	cpu_t		*cp;
	pg_cpu_itr_t	cpu_iter;
	lgrp_handle_t	lgrp;

	ASSERT(MUTEX_HELD(&cpu_lock));

revalidate:
	size = *sz;
	pg_bad = NULL;
	lgrp = LGRP_NULL_HANDLE;
	for (i = 0; i < size; i++) {

		pg = lineage[i];
		if (i < size - 1)
			pg_next = lineage[i + 1];
		else
			pg_next = NULL;

		/*
		 * We assume that the lineage has already been sorted
		 * by the number of CPUs. In fact, we depend on it.
		 */
		ASSERT(pg_next == NULL ||
		    (PG_NUM_CPUS((pg_t *)pg) <= PG_NUM_CPUS((pg_t *)pg_next)));

		/*
		 * Check to make sure that the existing parent of PG (if any)
		 * is either in the PG's lineage, or the PG has more CPUs than
		 * its existing parent and can and should be promoted above its
		 * parent.
		 *
		 * Since the PG topology is in the middle of being changed, we
		 * need to check whether the PG's existing parent (if any) is
		 * part of its lineage (and therefore should contain the new
		 * CPU). If not, it means that the addition of the new CPU
		 * should have made this PG have more CPUs than its parent, and
		 * this PG should be promoted to be above its existing parent
		 * now. We need to verify all of this to defend against a buggy
		 * BIOS giving bad power domain CPU groupings. Sigh.
		 */
		if (pg->cmt_parent) {
			/*
			 * Determine if cmt_parent is in this lineage
			 */
			for (j = 0; j < size; j++) {
				pg_tmp = lineage[j];
				if (pg_tmp == pg->cmt_parent)
					break;
			}
			if (pg_tmp != pg->cmt_parent) {
				/*
				 * cmt_parent is not in the lineage, verify
				 * it is a proper subset of PG.
				 */
				if (PG_NUM_CPUS((pg_t *)pg->cmt_parent) >=
				    PG_NUM_CPUS((pg_t *)pg)) {
					/*
					 * Not a proper subset if pg has fewer
					 * CPUs than cmt_parent...
					 */
					cmt_lineage_status =
					    CMT_LINEAGE_NON_PROMOTABLE;
					goto handle_error;
				}
			}
		}

		/*
		 * Walk each of the CPUs in the PG's group and perform
		 * consistency checks along the way.
		 */
		PG_CPU_ITR_INIT((pg_t *)pg, cpu_iter);
		while ((cp = pg_cpu_next(&cpu_iter)) != NULL) {
			/*
			 * Verify that there aren't any CPUs contained in PG
			 * that the next PG in the lineage (which is larger
			 * or same size) doesn't also contain.
			 */
			if (pg_next != NULL &&
			    pg_cpu_find((pg_t *)pg_next, cp) == B_FALSE) {
				cmt_lineage_status = CMT_LINEAGE_NON_CONCENTRIC;
				goto handle_error;
			}

			/*
			 * Verify that all the CPUs in the PG are in the same
			 * lgroup.
			 */
			if (lgrp == LGRP_NULL_HANDLE) {
				lgrp = lgrp_plat_cpu_to_hand(cp->cpu_id);
			} else if (lgrp_plat_cpu_to_hand(cp->cpu_id) != lgrp) {
				cmt_lineage_status = CMT_LINEAGE_PG_SPANS_LGRPS;
				goto handle_error;
			}
		}
	}

handle_error:
	/*
	 * Some of these validation errors can result when the CPU grouping
	 * information is derived from buggy sources (for example, incorrect
	 * ACPI tables on x86 systems).
	 *
	 * We'll try to recover in such cases by pruning out the illegal
	 * groupings from the PG hierarchy, which means that we won't optimize
	 * for those levels, but we will for the remaining ones.
	 */
	switch (cmt_lineage_status) {
	case CMT_LINEAGE_VALID:
	case CMT_LINEAGE_REPAIRED:
		break;
	case CMT_LINEAGE_PG_SPANS_LGRPS:
		/*
		 * We've detected a PG whose CPUs span lgroups.
		 *
		 * This isn't supported, as the dispatcher isn't allowed
		 * to do CMT thread placement across lgroups, as this would
		 * conflict with policies implementing MPO thread affinity.
		 *
		 * The handling for this falls through to the next case.
		 */
	case CMT_LINEAGE_NON_PROMOTABLE:
		/*
		 * We've detected a PG that already exists in another CPU's
		 * lineage that cannot legally be promoted into place
		 * without breaking the invariants of the hierarchy.
		 */
		if (PG_CMT_HW_SUSPECT(((pghw_t *)pg)->pghw_hw)) {
			if (pg_cmt_prune(pg, lineage, sz, pgdata) == 0) {
				cmt_lineage_status = CMT_LINEAGE_REPAIRED;
				goto revalidate;
			}
		}
		/*
		 * Something went wrong trying to prune out the bad level.
		 * Disable CMT scheduling altogether.
		 */
		pg_cmt_disable();
		break;
	case CMT_LINEAGE_NON_CONCENTRIC:
		/*
		 * We've detected a non-concentric PG lineage, which means that
		 * there's a PG in the lineage that has CPUs that the next PG
		 * over in the lineage (which is the same size or larger)
		 * doesn't have.
		 *
		 * In this case, we examine the two PGs to see if either
		 * grouping is defined by potentially buggy sources.
		 *
		 * If one has fewer CPUs than the other, and contains CPUs
		 * not found in the parent, and it is an untrusted enumeration,
		 * then prune it. If both have the same number of CPUs, then
		 * prune the one that is untrusted.
		 *
		 * This process repeats until we have a concentric lineage,
		 * or we would have to prune out a level derived from what we
		 * thought was a reliable source, in which case CMT scheduling
		 * is disabled altogether.
		 */
		if ((PG_NUM_CPUS((pg_t *)pg) < PG_NUM_CPUS((pg_t *)pg_next)) &&
		    (PG_CMT_HW_SUSPECT(((pghw_t *)pg)->pghw_hw))) {
			pg_bad = pg;
		} else if (PG_NUM_CPUS((pg_t *)pg) ==
		    PG_NUM_CPUS((pg_t *)pg_next)) {
			if (PG_CMT_HW_SUSPECT(((pghw_t *)pg_next)->pghw_hw)) {
				pg_bad = pg_next;
			} else if (PG_CMT_HW_SUSPECT(((pghw_t *)pg)->pghw_hw)) {
				pg_bad = pg;
			}
		}
		if (pg_bad) {
			if (pg_cmt_prune(pg_bad, lineage, sz, pgdata) == 0) {
				cmt_lineage_status = CMT_LINEAGE_REPAIRED;
				goto revalidate;
			}
		}
		/*
		 * Something went wrong trying to identify and/or prune out
		 * the bad level. Disable CMT scheduling altogether.
		 */
		pg_cmt_disable();
		break;
	default:
		/*
		 * If we're here, we've encountered a validation error for
		 * which we don't know how to recover. In this case, disable
		 * CMT scheduling altogether.
		 */
		cmt_lineage_status = CMT_LINEAGE_UNRECOVERABLE;
		pg_cmt_disable();
	}
	return (cmt_lineage_status);
}