xref: /illumos-gate/usr/src/uts/common/disp/cmt.c (revision c211fc479225fa54805cf480633bf6689ca9a2db)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/systm.h>
27 #include <sys/types.h>
28 #include <sys/param.h>
29 #include <sys/thread.h>
30 #include <sys/cpuvar.h>
31 #include <sys/cpupart.h>
32 #include <sys/kmem.h>
33 #include <sys/cmn_err.h>
34 #include <sys/kstat.h>
35 #include <sys/processor.h>
36 #include <sys/disp.h>
37 #include <sys/group.h>
38 #include <sys/pghw.h>
39 #include <sys/bitset.h>
40 #include <sys/lgrp.h>
41 #include <sys/cmt.h>
42 #include <sys/cpu_pm.h>
43 
44 /*
45  * CMT scheduler / dispatcher support
46  *
47  * This file implements CMT scheduler support using Processor Groups.
48  * The CMT processor group class creates and maintains the CMT class
49  * specific processor group pg_cmt_t.
50  *
51  * ---------------------------- <-- pg_cmt_t *
52  * | pghw_t                   |
53  * ----------------------------
54  * | CMT class specific data  |
55  * | - hierarchy linkage      |
56  * | - CMT load balancing data|
57  * | - active CPU group/bitset|
58  * ----------------------------
59  *
60  * The scheduler/dispatcher leverages knowledge of the performance
61  * relevant CMT sharing relationships existing between cpus to implement
62  * optimized affinity, load balancing, and coalescence policies.
63  *
64  * Load balancing policy seeks to improve performance by minimizing
65  * contention over shared processor resources / facilities, Affinity
66  * policies seek to improve cache and TLB utilization. Coalescence
67  * policies improve resource utilization and ultimately power efficiency.
68  *
69  * The CMT PGs created by this class are already arranged into a
70  * hierarchy (which is done in the pghw layer). To implement the top-down
71  * CMT load balancing algorithm, the CMT PGs additionally maintain
72  * parent, child and sibling hierarchy relationships.
73  * Parent PGs always contain a superset of their children's resources,
74  * each PG can have at most one parent, and siblings are the group of PGs
75  * sharing the same parent.
76  *
77  * On NUMA systems, the CMT load balancing algorithm balances across the
78  * CMT PGs within their respective lgroups. On UMA based system, there
79  * exists a top level group of PGs to balance across. On NUMA systems multiple
80  * top level groups are instantiated, where the top level balancing begins by
81  * balancing across the CMT PGs within their respective (per lgroup) top level
82  * groups.
83  */
84 static cmt_lgrp_t	*cmt_lgrps = NULL;	/* cmt_lgrps list head */
85 static cmt_lgrp_t	*cpu0_lgrp = NULL;	/* boot CPU's initial lgrp */
86 						/* used for null_proc_lpa */
87 cmt_lgrp_t		*cmt_root = NULL;	/* Reference to root cmt pg */
88 
89 static int		is_cpu0 = 1; /* true if this is boot CPU context */
90 
91 /*
92  * Array of hardware sharing relationships that are blacklisted.
93  * PGs won't be instantiated for blacklisted hardware sharing relationships.
94  */
95 static int		cmt_hw_blacklisted[PGHW_NUM_COMPONENTS];
96 
97 /*
98  * Set this to non-zero to disable CMT scheduling
99  * This must be done via kmdb -d, as /etc/system will be too late
100  */
101 int			cmt_sched_disabled = 0;
102 
103 static pg_cid_t		pg_cmt_class_id;		/* PG class id */
104 
105 static pg_t		*pg_cmt_alloc();
106 static void		pg_cmt_free(pg_t *);
107 static void		pg_cmt_cpu_init(cpu_t *);
108 static void		pg_cmt_cpu_fini(cpu_t *);
109 static void		pg_cmt_cpu_active(cpu_t *);
110 static void		pg_cmt_cpu_inactive(cpu_t *);
111 static void		pg_cmt_cpupart_in(cpu_t *, cpupart_t *);
112 static void		pg_cmt_cpupart_move(cpu_t *, cpupart_t *, cpupart_t *);
113 static char		*pg_cmt_policy_name(pg_t *);
114 static void		pg_cmt_hier_sort(pg_cmt_t **, int);
115 static pg_cmt_t		*pg_cmt_hier_rank(pg_cmt_t *, pg_cmt_t *);
116 static int		pg_cmt_cpu_belongs(pg_t *, cpu_t *);
117 static int		pg_cmt_hw(pghw_type_t);
118 static cmt_lgrp_t	*pg_cmt_find_lgrp(lgrp_handle_t);
119 static cmt_lgrp_t	*pg_cmt_lgrp_create(lgrp_handle_t);
120 static int		pg_cmt_lineage_validate(pg_cmt_t **, int *);
121 static void		cmt_ev_thread_swtch(pg_t *, cpu_t *, hrtime_t,
122 			    kthread_t *, kthread_t *);
123 static void		cmt_ev_thread_swtch_pwr(pg_t *, cpu_t *, hrtime_t,
124 			    kthread_t *, kthread_t *);
125 static void		cmt_ev_thread_remain_pwr(pg_t *, cpu_t *, kthread_t *);
126 
127 /*
128  * Macro to test if PG is managed by the CMT PG class
129  */
130 #define	IS_CMT_PG(pg)	(((pg_t *)(pg))->pg_class->pgc_id == pg_cmt_class_id)
131 
132 /*
133  * Status codes for CMT lineage validation
134  * See cmt_lineage_validate() below
135  */
136 typedef enum cmt_lineage_validation {
137 	CMT_LINEAGE_VALID,
138 	CMT_LINEAGE_NON_CONCENTRIC,
139 	CMT_LINEAGE_REPAIRED,
140 	CMT_LINEAGE_UNRECOVERABLE
141 } cmt_lineage_validation_t;
142 
143 /*
144  * Status of the current lineage under construction.
145  * One must be holding cpu_lock to change this.
146  */
147 static cmt_lineage_validation_t	cmt_lineage_status = CMT_LINEAGE_VALID;
148 
149 /*
150  * Power domain definitions (on x86) are defined by ACPI, and
151  * therefore may be subject to BIOS bugs.
152  */
153 #define	PG_CMT_HW_SUSPECT(hw)	PGHW_IS_PM_DOMAIN(hw)
154 
155 /*
156  * CMT PG ops
157  */
struct pg_ops pg_ops_cmt = {
	pg_cmt_alloc,		/* alloc */
	pg_cmt_free,		/* free */
	pg_cmt_cpu_init,	/* cpu_init */
	pg_cmt_cpu_fini,	/* cpu_fini */
	pg_cmt_cpu_active,	/* cpu_active */
	pg_cmt_cpu_inactive,	/* cpu_inactive */
	pg_cmt_cpupart_in,	/* cpupart_in */
	NULL,			/* cpupart_out */
	pg_cmt_cpupart_move,	/* cpupart_move */
	pg_cmt_cpu_belongs,	/* cpu_belongs */
	pg_cmt_policy_name,	/* policy_name */
};
171 
172 /*
173  * Initialize the CMT PG class
174  */
175 void
176 pg_cmt_class_init(void)
177 {
178 	if (cmt_sched_disabled)
179 		return;
180 
181 	pg_cmt_class_id = pg_class_register("cmt", &pg_ops_cmt, PGR_PHYSICAL);
182 }
183 
184 /*
185  * Called to indicate a new CPU has started up so
186  * that either t0 or the slave startup thread can
187  * be accounted for.
188  */
189 void
190 pg_cmt_cpu_startup(cpu_t *cp)
191 {
192 	pg_ev_thread_swtch(cp, gethrtime_unscaled(), cp->cpu_idle_thread,
193 	    cp->cpu_thread);
194 }
195 
196 /*
197  * Return non-zero if thread can migrate between "from" and "to"
198  * without a performance penalty
199  */
200 int
201 pg_cmt_can_migrate(cpu_t *from, cpu_t *to)
202 {
203 	if (from->cpu_physid->cpu_cacheid ==
204 	    to->cpu_physid->cpu_cacheid)
205 		return (1);
206 	return (0);
207 }
208 
209 /*
210  * CMT class specific PG allocation
211  */
212 static pg_t *
213 pg_cmt_alloc(void)
214 {
215 	return (kmem_zalloc(sizeof (pg_cmt_t), KM_NOSLEEP));
216 }
217 
218 /*
219  * Class specific PG de-allocation
220  */
221 static void
222 pg_cmt_free(pg_t *pg)
223 {
224 	ASSERT(pg != NULL);
225 	ASSERT(IS_CMT_PG(pg));
226 
227 	kmem_free((pg_cmt_t *)pg, sizeof (pg_cmt_t));
228 }
229 
230 /*
231  * Given a hardware sharing relationship, return which dispatcher
232  * policies should be implemented to optimize performance and efficiency
233  */
234 static pg_cmt_policy_t
235 pg_cmt_policy(pghw_type_t hw)
236 {
237 	pg_cmt_policy_t p;
238 
239 	/*
240 	 * Give the platform a chance to override the default
241 	 */
242 	if ((p = pg_plat_cmt_policy(hw)) != CMT_NO_POLICY)
243 		return (p);
244 
245 	switch (hw) {
246 	case PGHW_IPIPE:
247 	case PGHW_FPU:
248 	case PGHW_CHIP:
249 		return (CMT_BALANCE);
250 	case PGHW_CACHE:
251 		return (CMT_AFFINITY);
252 	case PGHW_POW_ACTIVE:
253 	case PGHW_POW_IDLE:
254 		return (CMT_BALANCE);
255 	default:
256 		return (CMT_NO_POLICY);
257 	}
258 }
259 
260 /*
261  * Rank the importance of optimizing for the pg1 relationship vs.
262  * the pg2 relationship.
263  */
264 static pg_cmt_t *
265 pg_cmt_hier_rank(pg_cmt_t *pg1, pg_cmt_t *pg2)
266 {
267 	pghw_type_t hw1 = ((pghw_t *)pg1)->pghw_hw;
268 	pghw_type_t hw2 = ((pghw_t *)pg2)->pghw_hw;
269 
270 	/*
271 	 * A power domain is only important if CPUPM is enabled.
272 	 */
273 	if (cpupm_get_policy() == CPUPM_POLICY_DISABLED) {
274 		if (PGHW_IS_PM_DOMAIN(hw1) && !PGHW_IS_PM_DOMAIN(hw2))
275 			return (pg2);
276 		if (PGHW_IS_PM_DOMAIN(hw2) && !PGHW_IS_PM_DOMAIN(hw1))
277 			return (pg1);
278 	}
279 
280 	/*
281 	 * Otherwise, ask the platform
282 	 */
283 	if (pg_plat_hw_rank(hw1, hw2) == hw1)
284 		return (pg1);
285 	else
286 		return (pg2);
287 }
288 
289 /*
290  * Initialize CMT callbacks for the given PG
291  */
292 static void
293 cmt_callback_init(pg_t *pg)
294 {
295 	switch (((pghw_t *)pg)->pghw_hw) {
296 	case PGHW_POW_ACTIVE:
297 		pg->pg_cb.thread_swtch = cmt_ev_thread_swtch_pwr;
298 		pg->pg_cb.thread_remain = cmt_ev_thread_remain_pwr;
299 		break;
300 	default:
301 		pg->pg_cb.thread_swtch = cmt_ev_thread_swtch;
302 
303 	}
304 }
305 
306 /*
307  * Promote PG above it's current parent.
308  * This is only legal if PG has an equal or greater number of CPUs
309  * than it's parent.
310  */
311 static void
312 cmt_hier_promote(pg_cmt_t *pg)
313 {
314 	pg_cmt_t	*parent;
315 	group_t		*children;
316 	cpu_t		*cpu;
317 	group_iter_t	iter;
318 	pg_cpu_itr_t	cpu_iter;
319 	int		r;
320 	int		err;
321 
322 	ASSERT(MUTEX_HELD(&cpu_lock));
323 
324 	parent = pg->cmt_parent;
325 	if (parent == NULL) {
326 		/*
327 		 * Nothing to do
328 		 */
329 		return;
330 	}
331 
332 	ASSERT(PG_NUM_CPUS((pg_t *)pg) >= PG_NUM_CPUS((pg_t *)parent));
333 
334 	/*
335 	 * We're changing around the hierarchy, which is actively traversed
336 	 * by the dispatcher. Pause CPUS to ensure exclusivity.
337 	 */
338 	pause_cpus(NULL);
339 
340 	/*
341 	 * If necessary, update the parent's sibling set, replacing parent
342 	 * with PG.
343 	 */
344 	if (parent->cmt_siblings) {
345 		if (group_remove(parent->cmt_siblings, parent, GRP_NORESIZE)
346 		    != -1) {
347 			r = group_add(parent->cmt_siblings, pg, GRP_NORESIZE);
348 			ASSERT(r != -1);
349 		}
350 	}
351 
352 	/*
353 	 * If the parent is at the top of the hierarchy, replace it's entry
354 	 * in the root lgroup's group of top level PGs.
355 	 */
356 	if (parent->cmt_parent == NULL &&
357 	    parent->cmt_siblings != &cmt_root->cl_pgs) {
358 		if (group_remove(&cmt_root->cl_pgs, parent, GRP_NORESIZE)
359 		    != -1) {
360 			r = group_add(&cmt_root->cl_pgs, pg, GRP_NORESIZE);
361 			ASSERT(r != -1);
362 		}
363 	}
364 
365 	/*
366 	 * We assume (and therefore assert) that the PG being promoted is an
367 	 * only child of it's parent. Update the parent's children set
368 	 * replacing PG's entry with the parent (since the parent is becoming
369 	 * the child). Then have PG and the parent swap children sets.
370 	 */
371 	ASSERT(GROUP_SIZE(parent->cmt_children) <= 1);
372 	if (group_remove(parent->cmt_children, pg, GRP_NORESIZE) != -1) {
373 		r = group_add(parent->cmt_children, parent, GRP_NORESIZE);
374 		ASSERT(r != -1);
375 	}
376 
377 	children = pg->cmt_children;
378 	pg->cmt_children = parent->cmt_children;
379 	parent->cmt_children = children;
380 
381 	/*
382 	 * Update the sibling references for PG and it's parent
383 	 */
384 	pg->cmt_siblings = parent->cmt_siblings;
385 	parent->cmt_siblings = pg->cmt_children;
386 
387 	/*
388 	 * Update any cached lineages in the per CPU pg data.
389 	 */
390 	PG_CPU_ITR_INIT(pg, cpu_iter);
391 	while ((cpu = pg_cpu_next(&cpu_iter)) != NULL) {
392 		int		idx;
393 		group_t		*pgs;
394 		pg_cmt_t	*cpu_pg;
395 
396 		/*
397 		 * Iterate over the CPU's PGs updating the children
398 		 * of the PG being promoted, since they have a new parent.
399 		 */
400 		pgs = &cpu->cpu_pg->pgs;
401 		group_iter_init(&iter);
402 		while ((cpu_pg = group_iterate(pgs, &iter)) != NULL) {
403 			if (cpu_pg->cmt_parent == pg) {
404 				cpu_pg->cmt_parent = parent;
405 			}
406 		}
407 
408 		/*
409 		 * Update the CMT load balancing lineage
410 		 */
411 		pgs = &cpu->cpu_pg->cmt_pgs;
412 		if ((idx = group_find(pgs, (void *)pg)) == -1) {
413 			/*
414 			 * Unless this is the CPU who's lineage is being
415 			 * constructed, the PG being promoted should be
416 			 * in the lineage.
417 			 */
418 			ASSERT(GROUP_SIZE(pgs) == 0);
419 			continue;
420 		}
421 
422 		ASSERT(GROUP_ACCESS(pgs, idx - 1) == parent);
423 		ASSERT(idx > 0);
424 
425 		/*
426 		 * Have the child and the parent swap places in the CPU's
427 		 * lineage
428 		 */
429 		group_remove_at(pgs, idx);
430 		group_remove_at(pgs, idx - 1);
431 		err = group_add_at(pgs, parent, idx);
432 		ASSERT(err == 0);
433 		err = group_add_at(pgs, pg, idx - 1);
434 		ASSERT(err == 0);
435 	}
436 
437 	/*
438 	 * Update the parent references for PG and it's parent
439 	 */
440 	pg->cmt_parent = parent->cmt_parent;
441 	parent->cmt_parent = pg;
442 
443 	start_cpus();
444 }
445 
446 /*
447  * CMT class callback for a new CPU entering the system
448  */
static void
pg_cmt_cpu_init(cpu_t *cp)
{
	pg_cmt_t	*pg;
	group_t		*cmt_pgs;	/* cp's load balancing lineage */
	int		levels, level;
	pghw_type_t	hw;
	pg_t		*pg_cache = NULL;
	pg_cmt_t	*cpu_cmt_hier[PGHW_NUM_COMPONENTS];
	lgrp_handle_t	lgrp_handle;
	cmt_lgrp_t	*lgrp;

	ASSERT(MUTEX_HELD(&cpu_lock));

	if (cmt_sched_disabled)
		return;

	/*
	 * A new CPU is coming into the system.
	 * Interrogate the platform to see if the CPU
	 * has any performance or efficiency relevant
	 * sharing relationships
	 */
	cmt_pgs = &cp->cpu_pg->cmt_pgs;
	cp->cpu_pg->cmt_lineage = NULL;

	bzero(cpu_cmt_hier, sizeof (cpu_cmt_hier));
	levels = 0;
	for (hw = PGHW_START; hw < PGHW_NUM_COMPONENTS; hw++) {

		pg_cmt_policy_t	policy;

		/*
		 * We're only interested in the hw sharing relationships
		 * for which we know how to optimize.
		 */
		policy = pg_cmt_policy(hw);
		if (policy == CMT_NO_POLICY ||
		    pg_plat_hw_shared(cp, hw) == 0)
			continue;

		/*
		 * Continue if the hardware sharing relationship has been
		 * blacklisted.
		 */
		if (cmt_hw_blacklisted[hw]) {
			continue;
		}

		/*
		 * Find (or create) the PG associated with
		 * the hw sharing relationship in which cp
		 * belongs.
		 *
		 * Determine if a suitable PG already
		 * exists, or if one needs to be created.
		 */
		pg = (pg_cmt_t *)pghw_place_cpu(cp, hw);
		if (pg == NULL) {
			/*
			 * Create a new one.
			 * Initialize the common...
			 */
			pg = (pg_cmt_t *)pg_create(pg_cmt_class_id);

			/* ... physical ... */
			pghw_init((pghw_t *)pg, cp, hw);

			/*
			 * ... and CMT specific portions of the
			 * structure.
			 */
			pg->cmt_policy = policy;

			/* CMT event callbacks */
			cmt_callback_init((pg_t *)pg);

			bitset_init(&pg->cmt_cpus_actv_set);
			group_create(&pg->cmt_cpus_actv);
		} else {
			ASSERT(IS_CMT_PG(pg));
		}

		/* Add the CPU to the PG */
		pg_cpu_add((pg_t *)pg, cp);

		/*
		 * Ensure capacity of the active CPU group/bitset
		 * (pre-expanding here lets pg_cmt_cpu_active() add with
		 * GRP_NORESIZE while CPUs are paused)
		 */
		group_expand(&pg->cmt_cpus_actv,
		    GROUP_SIZE(&((pg_t *)pg)->pg_cpus));

		if (cp->cpu_seqid >=
		    bitset_capacity(&pg->cmt_cpus_actv_set)) {
			bitset_resize(&pg->cmt_cpus_actv_set,
			    cp->cpu_seqid + 1);
		}

		/*
		 * Build a lineage of CMT PGs for load balancing / coalescence
		 */
		if (policy & (CMT_BALANCE | CMT_COALESCE)) {
			cpu_cmt_hier[levels++] = pg;
		}

		/* Cache this for later */
		if (hw == PGHW_CACHE)
			pg_cache = (pg_t *)pg;
	}

	group_expand(cmt_pgs, levels);

	/* Lazily create the root lgroup's cmt_lgrp_t on first use */
	if (cmt_root == NULL)
		cmt_root = pg_cmt_lgrp_create(lgrp_plat_root_hand());

	/*
	 * Find the lgrp that encapsulates this CPU's CMT hierarchy
	 */
	lgrp_handle = lgrp_plat_cpu_to_hand(cp->cpu_id);
	if ((lgrp = pg_cmt_find_lgrp(lgrp_handle)) == NULL)
		lgrp = pg_cmt_lgrp_create(lgrp_handle);

	/*
	 * Ascendingly sort the PGs in the lineage by number of CPUs
	 */
	pg_cmt_hier_sort(cpu_cmt_hier, levels);

	/*
	 * Examine the lineage and validate it.
	 * This routine will also try to fix the lineage along with the
	 * rest of the PG hierarchy should it detect an issue.
	 *
	 * If it returns -1, an unrecoverable error has happened and we
	 * need to return.
	 */
	if (pg_cmt_lineage_validate(cpu_cmt_hier, &levels) < 0)
		return;

	/*
	 * For existing PGs in the lineage, verify that the parent is
	 * correct, as the generation in the lineage may have changed
	 * as a result of the sorting. Start the traversal at the top
	 * of the lineage, moving down.
	 */
	for (level = levels - 1; level >= 0; ) {
		int reorg;

		reorg = 0;
		pg = cpu_cmt_hier[level];

		/*
		 * Promote PGs at an incorrect generation into place.
		 *
		 * NOTE(review): at the top level (level == levels - 1),
		 * cpu_cmt_hier[level + 1] is read when pg->cmt_parent is
		 * non-NULL. The entry is NULL (bzero'd above) when
		 * levels < PGHW_NUM_COMPONENTS; confirm a top-level PG
		 * cannot appear here with levels == PGHW_NUM_COMPONENTS
		 * and a non-NULL parent.
		 */
		while (pg->cmt_parent &&
		    pg->cmt_parent != cpu_cmt_hier[level + 1]) {
			cmt_hier_promote(pg);
			reorg++;
		}
		/* If anything moved, re-verify from the top */
		if (reorg > 0)
			level = levels - 1;
		else
			level--;
	}

	/*
	 * For each of the PGs in the CPU's lineage:
	 *	- Add an entry in the CPU sorted CMT PG group
	 *	  which is used for top down CMT load balancing
	 *	- Tie the PG into the CMT hierarchy by connecting
	 *	  it to it's parent and siblings.
	 */
	for (level = 0; level < levels; level++) {
		uint_t		children;
		int		err;

		pg = cpu_cmt_hier[level];
		/* cmt_pgs is ordered largest (index 0) to smallest */
		err = group_add_at(cmt_pgs, pg, levels - level - 1);
		ASSERT(err == 0);

		/* The smallest PG is the head of the lineage */
		if (level == 0)
			cp->cpu_pg->cmt_lineage = (pg_t *)pg;

		if (pg->cmt_siblings != NULL) {
			/* Already initialized */
			ASSERT(pg->cmt_parent == NULL ||
			    pg->cmt_parent == cpu_cmt_hier[level + 1]);
			ASSERT(pg->cmt_siblings == &lgrp->cl_pgs ||
			    ((pg->cmt_parent != NULL) &&
			    pg->cmt_siblings == pg->cmt_parent->cmt_children));
			continue;
		}

		if ((level + 1) == levels) {
			/* Top of the lineage: parent is the lgroup itself */
			pg->cmt_parent = NULL;

			pg->cmt_siblings = &lgrp->cl_pgs;
			children = ++lgrp->cl_npgs;
			if (cmt_root != lgrp)
				cmt_root->cl_npgs++;
		} else {
			pg->cmt_parent = cpu_cmt_hier[level + 1];

			/*
			 * A good parent keeps track of their children.
			 * The parent's children group is also the PG's
			 * siblings.
			 */
			if (pg->cmt_parent->cmt_children == NULL) {
				pg->cmt_parent->cmt_children =
				    kmem_zalloc(sizeof (group_t), KM_SLEEP);
				group_create(pg->cmt_parent->cmt_children);
			}
			pg->cmt_siblings = pg->cmt_parent->cmt_children;
			children = ++pg->cmt_parent->cmt_nchildren;
		}

		/* Pre-size so later GRP_NORESIZE adds cannot fail */
		group_expand(pg->cmt_siblings, children);
		group_expand(&cmt_root->cl_pgs, cmt_root->cl_npgs);
	}

	/*
	 * Cache the chip and core IDs in the cpu_t->cpu_physid structure
	 * for fast lookups later.
	 */
	if (cp->cpu_physid) {
		cp->cpu_physid->cpu_chipid =
		    pg_plat_hw_instance_id(cp, PGHW_CHIP);
		cp->cpu_physid->cpu_coreid = pg_plat_get_core_id(cp);

		/*
		 * If this cpu has a PG representing shared cache, then set
		 * cpu_cacheid to that PG's logical id
		 */
		if (pg_cache)
			cp->cpu_physid->cpu_cacheid = pg_cache->pg_id;
	}

	/* CPU0 only initialization */
	if (is_cpu0) {
		pg_cmt_cpu_startup(cp);
		is_cpu0 = 0;
		cpu0_lgrp = lgrp;
	}

}
694 
695 /*
696  * Class callback when a CPU is leaving the system (deletion)
697  */
static void
pg_cmt_cpu_fini(cpu_t *cp)
{
	group_iter_t	i;
	pg_cmt_t	*pg;
	group_t		*pgs, *cmt_pgs;
	lgrp_handle_t	lgrp_handle;
	cmt_lgrp_t	*lgrp;

	if (cmt_sched_disabled)
		return;

	pgs = &cp->cpu_pg->pgs;
	cmt_pgs = &cp->cpu_pg->cmt_pgs;

	/*
	 * Find the lgroup that encapsulates this CPU's CMT hierarchy
	 */
	lgrp_handle = lgrp_plat_cpu_to_hand(cp->cpu_id);

	lgrp = pg_cmt_find_lgrp(lgrp_handle);
	if (ncpus == 1 && lgrp != cpu0_lgrp) {
		/*
		 * One might wonder how we could be deconfiguring the
		 * only CPU in the system.
		 *
		 * On Starcat systems when null_proc_lpa is detected,
		 * the boot CPU (which is already configured into a leaf
		 * lgroup), is moved into the root lgroup. This is done by
		 * deconfiguring it from both lgroups and processor
		 * groups), and then later reconfiguring it back in.  This
		 * call to pg_cmt_cpu_fini() is part of that deconfiguration.
		 *
		 * This special case is detected by noting that the platform
		 * has changed the CPU's lgrp affiliation (since it now
		 * belongs in the root). In this case, use the cmt_lgrp_t
		 * cached for the boot CPU, since this is what needs to be
		 * torn down.
		 */
		lgrp = cpu0_lgrp;
	}

	ASSERT(lgrp != NULL);

	/*
	 * First, clean up anything load balancing specific for each of
	 * the CPU's PGs that participated in CMT load balancing
	 *
	 * (The walk follows cmt_parent pointers upward from the lineage
	 * head, so it visits exactly the PGs in cp's balancing lineage.)
	 */
	pg = (pg_cmt_t *)cp->cpu_pg->cmt_lineage;
	while (pg != NULL) {

		/*
		 * Remove the PG from the CPU's load balancing lineage
		 */
		(void) group_remove(cmt_pgs, pg, GRP_RESIZE);

		/*
		 * If it's about to become empty, destroy it's children
		 * group, and remove it's reference from it's siblings.
		 * This is done here (rather than below) to avoid removing
		 * our reference from a PG that we just eliminated.
		 */
		if (GROUP_SIZE(&((pg_t *)pg)->pg_cpus) == 1) {
			if (pg->cmt_children != NULL)
				group_destroy(pg->cmt_children);
			if (pg->cmt_siblings != NULL) {
				if (pg->cmt_siblings == &lgrp->cl_pgs)
					lgrp->cl_npgs--;
				else
					pg->cmt_parent->cmt_nchildren--;
			}
		}
		pg = pg->cmt_parent;
	}
	/* The lineage group must now be empty */
	ASSERT(GROUP_SIZE(cmt_pgs) == 0);

	/*
	 * Now that the load balancing lineage updates have happened,
	 * remove the CPU from all it's PGs (destroying any that become
	 * empty).
	 */
	group_iter_init(&i);
	while ((pg = group_iterate(pgs, &i)) != NULL) {
		if (IS_CMT_PG(pg) == 0)
			continue;

		pg_cpu_delete((pg_t *)pg, cp);
		/*
		 * Deleting the CPU from the PG changes the CPU's
		 * PG group over which we are actively iterating
		 * Re-initialize the iteration
		 */
		group_iter_init(&i);

		if (GROUP_SIZE(&((pg_t *)pg)->pg_cpus) == 0) {

			/*
			 * The PG has become zero sized, so destroy it.
			 */
			group_destroy(&pg->cmt_cpus_actv);
			bitset_fini(&pg->cmt_cpus_actv_set);
			pghw_fini((pghw_t *)pg);

			pg_destroy((pg_t *)pg);
		}
	}
}
805 
806 /*
807  * Class callback when a CPU is entering a cpu partition
808  */
809 static void
810 pg_cmt_cpupart_in(cpu_t *cp, cpupart_t *pp)
811 {
812 	group_t		*pgs;
813 	pg_t		*pg;
814 	group_iter_t	i;
815 
816 	ASSERT(MUTEX_HELD(&cpu_lock));
817 
818 	if (cmt_sched_disabled)
819 		return;
820 
821 	pgs = &cp->cpu_pg->pgs;
822 
823 	/*
824 	 * Ensure that the new partition's PG bitset
825 	 * is large enough for all CMT PG's to which cp
826 	 * belongs
827 	 */
828 	group_iter_init(&i);
829 	while ((pg = group_iterate(pgs, &i)) != NULL) {
830 		if (IS_CMT_PG(pg) == 0)
831 			continue;
832 
833 		if (bitset_capacity(&pp->cp_cmt_pgs) <= pg->pg_id)
834 			bitset_resize(&pp->cp_cmt_pgs, pg->pg_id + 1);
835 	}
836 }
837 
838 /*
839  * Class callback when a CPU is actually moving partitions
840  */
841 static void
842 pg_cmt_cpupart_move(cpu_t *cp, cpupart_t *oldpp, cpupart_t *newpp)
843 {
844 	cpu_t		*cpp;
845 	group_t		*pgs;
846 	pg_t		*pg;
847 	group_iter_t	pg_iter;
848 	pg_cpu_itr_t	cpu_iter;
849 	boolean_t	found;
850 
851 	ASSERT(MUTEX_HELD(&cpu_lock));
852 
853 	if (cmt_sched_disabled)
854 		return;
855 
856 	pgs = &cp->cpu_pg->pgs;
857 	group_iter_init(&pg_iter);
858 
859 	/*
860 	 * Iterate over the CPUs CMT PGs
861 	 */
862 	while ((pg = group_iterate(pgs, &pg_iter)) != NULL) {
863 
864 		if (IS_CMT_PG(pg) == 0)
865 			continue;
866 
867 		/*
868 		 * Add the PG to the bitset in the new partition.
869 		 */
870 		bitset_add(&newpp->cp_cmt_pgs, pg->pg_id);
871 
872 		/*
873 		 * Remove the PG from the bitset in the old partition
874 		 * if the last of the PG's CPUs have left.
875 		 */
876 		found = B_FALSE;
877 		PG_CPU_ITR_INIT(pg, cpu_iter);
878 		while ((cpp = pg_cpu_next(&cpu_iter)) != NULL) {
879 			if (cpp == cp)
880 				continue;
881 			if (CPU_ACTIVE(cpp) &&
882 			    cpp->cpu_part->cp_id == oldpp->cp_id) {
883 				found = B_TRUE;
884 				break;
885 			}
886 		}
887 		if (!found)
888 			bitset_del(&cp->cpu_part->cp_cmt_pgs, pg->pg_id);
889 	}
890 }
891 
892 /*
893  * Class callback when a CPU becomes active (online)
894  *
895  * This is called in a context where CPUs are paused
896  */
static void
pg_cmt_cpu_active(cpu_t *cp)
{
	int		err;
	group_iter_t	i;
	pg_cmt_t	*pg;
	group_t		*pgs;

	ASSERT(MUTEX_HELD(&cpu_lock));

	if (cmt_sched_disabled)
		return;

	pgs = &cp->cpu_pg->pgs;
	group_iter_init(&i);

	/*
	 * Iterate over the CPU's PGs
	 */
	while ((pg = group_iterate(pgs, &i)) != NULL) {

		if (IS_CMT_PG(pg) == 0)
			continue;

		/*
		 * GRP_NORESIZE is safe here: capacity was pre-expanded
		 * in pg_cmt_cpu_init() (no allocation while CPUs paused).
		 */
		err = group_add(&pg->cmt_cpus_actv, cp, GRP_NORESIZE);
		ASSERT(err == 0);

		/*
		 * If this is the first active CPU in the PG, and it
		 * represents a hardware sharing relationship over which
		 * CMT load balancing is performed, add it as a candidate
		 * for balancing with it's siblings.
		 */
		if (GROUP_SIZE(&pg->cmt_cpus_actv) == 1 &&
		    (pg->cmt_policy & (CMT_BALANCE | CMT_COALESCE))) {
			err = group_add(pg->cmt_siblings, pg, GRP_NORESIZE);
			ASSERT(err == 0);

			/*
			 * If this is a top level PG, add it as a balancing
			 * candidate when balancing within the root lgroup.
			 */
			if (pg->cmt_parent == NULL &&
			    pg->cmt_siblings != &cmt_root->cl_pgs) {
				err = group_add(&cmt_root->cl_pgs, pg,
				    GRP_NORESIZE);
				ASSERT(err == 0);
			}
		}

		/*
		 * Notate the CPU in the PGs active CPU bitset.
		 * Also notate the PG as being active in it's associated
		 * partition
		 */
		bitset_add(&pg->cmt_cpus_actv_set, cp->cpu_seqid);
		bitset_add(&cp->cpu_part->cp_cmt_pgs, ((pg_t *)pg)->pg_id);
	}
}
956 
957 /*
958  * Class callback when a CPU goes inactive (offline)
959  *
960  * This is called in a context where CPUs are paused
961  */
static void
pg_cmt_cpu_inactive(cpu_t *cp)
{
	int		err;
	group_t		*pgs;
	pg_cmt_t	*pg;
	cpu_t		*cpp;
	group_iter_t	i;
	pg_cpu_itr_t	cpu_itr;
	boolean_t	found;

	ASSERT(MUTEX_HELD(&cpu_lock));

	if (cmt_sched_disabled)
		return;

	pgs = &cp->cpu_pg->pgs;
	group_iter_init(&i);

	while ((pg = group_iterate(pgs, &i)) != NULL) {

		if (IS_CMT_PG(pg) == 0)
			continue;

		/*
		 * Remove the CPU from the CMT PGs active CPU group
		 * bitmap
		 */
		err = group_remove(&pg->cmt_cpus_actv, cp, GRP_NORESIZE);
		ASSERT(err == 0);

		bitset_del(&pg->cmt_cpus_actv_set, cp->cpu_seqid);

		/*
		 * If there are no more active CPUs in this PG over which
		 * load was balanced, remove it as a balancing candidate.
		 * (Mirrors the additions performed in pg_cmt_cpu_active().)
		 */
		if (GROUP_SIZE(&pg->cmt_cpus_actv) == 0 &&
		    (pg->cmt_policy & (CMT_BALANCE | CMT_COALESCE))) {
			err = group_remove(pg->cmt_siblings, pg, GRP_NORESIZE);
			ASSERT(err == 0);

			if (pg->cmt_parent == NULL &&
			    pg->cmt_siblings != &cmt_root->cl_pgs) {
				err = group_remove(&cmt_root->cl_pgs, pg,
				    GRP_NORESIZE);
				ASSERT(err == 0);
			}
		}

		/*
		 * Assert the number of active CPUs does not exceed
		 * the total number of CPUs in the PG
		 */
		ASSERT(GROUP_SIZE(&pg->cmt_cpus_actv) <=
		    GROUP_SIZE(&((pg_t *)pg)->pg_cpus));

		/*
		 * Update the PG bitset in the CPU's old partition
		 * (clear the bit if no other active CPU of this PG
		 * remains in cp's partition)
		 */
		found = B_FALSE;
		PG_CPU_ITR_INIT(pg, cpu_itr);
		while ((cpp = pg_cpu_next(&cpu_itr)) != NULL) {
			if (cpp == cp)
				continue;
			if (CPU_ACTIVE(cpp) &&
			    cpp->cpu_part->cp_id == cp->cpu_part->cp_id) {
				found = B_TRUE;
				break;
			}
		}
		if (!found) {
			bitset_del(&cp->cpu_part->cp_cmt_pgs,
			    ((pg_t *)pg)->pg_id);
		}
	}
}
1039 
1040 /*
1041  * Return non-zero if the CPU belongs in the given PG
1042  */
1043 static int
1044 pg_cmt_cpu_belongs(pg_t *pg, cpu_t *cp)
1045 {
1046 	cpu_t	*pg_cpu;
1047 
1048 	pg_cpu = GROUP_ACCESS(&pg->pg_cpus, 0);
1049 
1050 	ASSERT(pg_cpu != NULL);
1051 
1052 	/*
1053 	 * The CPU belongs if, given the nature of the hardware sharing
1054 	 * relationship represented by the PG, the CPU has that
1055 	 * relationship with some other CPU already in the PG
1056 	 */
1057 	if (pg_plat_cpus_share(cp, pg_cpu, ((pghw_t *)pg)->pghw_hw))
1058 		return (1);
1059 
1060 	return (0);
1061 }
1062 
1063 /*
1064  * Sort the CPUs CMT hierarchy, where "size" is the number of levels.
1065  */
static void
pg_cmt_hier_sort(pg_cmt_t **hier, int size)
{
	int		i, j, inc;
	pg_t		*tmp;
	pg_t		**h = (pg_t **)hier;

	/*
	 * First sort by number of CPUs
	 * (Shellsort: gap starts at size/2 and shrinks by 5/11 each
	 * pass; the inc == 2 special case forces a final inc == 1 pass,
	 * which guarantees the array ends fully sorted.)
	 */
	inc = size / 2;
	while (inc > 0) {
		for (i = inc; i < size; i++) {
			j = i;
			tmp = h[i];
			while ((j >= inc) &&
			    (PG_NUM_CPUS(h[j - inc]) > PG_NUM_CPUS(tmp))) {
				h[j] = h[j - inc];
				j = j - inc;
			}
			h[j] = tmp;
		}
		if (inc == 2)
			inc = 1;
		else
			inc = (inc * 5) / 11;
	}

	/*
	 * Break ties by asking the platform.
	 * Determine if h[i] outranks h[i + 1] and if so, swap them.
	 *
	 * NOTE(review): this is a single adjacent-swap pass; for a run
	 * of three or more equal-sized PGs it does not guarantee a full
	 * rank ordering — confirm whether that can occur in practice.
	 */
	for (i = 0; i < size - 1; i++) {
		if ((PG_NUM_CPUS(h[i]) == PG_NUM_CPUS(h[i + 1])) &&
		    pg_cmt_hier_rank(hier[i], hier[i + 1]) == hier[i]) {
			tmp = h[i];
			h[i] = h[i + 1];
			h[i + 1] = tmp;
		}
	}
}
1107 
1108 /*
1109  * Return a cmt_lgrp_t * given an lgroup handle.
1110  */
1111 static cmt_lgrp_t *
1112 pg_cmt_find_lgrp(lgrp_handle_t hand)
1113 {
1114 	cmt_lgrp_t	*lgrp;
1115 
1116 	ASSERT(MUTEX_HELD(&cpu_lock));
1117 
1118 	lgrp = cmt_lgrps;
1119 	while (lgrp != NULL) {
1120 		if (lgrp->cl_hand == hand)
1121 			break;
1122 		lgrp = lgrp->cl_next;
1123 	}
1124 	return (lgrp);
1125 }
1126 
1127 /*
1128  * Create a cmt_lgrp_t with the specified handle.
1129  */
1130 static cmt_lgrp_t *
1131 pg_cmt_lgrp_create(lgrp_handle_t hand)
1132 {
1133 	cmt_lgrp_t	*lgrp;
1134 
1135 	ASSERT(MUTEX_HELD(&cpu_lock));
1136 
1137 	lgrp = kmem_zalloc(sizeof (cmt_lgrp_t), KM_SLEEP);
1138 
1139 	lgrp->cl_hand = hand;
1140 	lgrp->cl_npgs = 0;
1141 	lgrp->cl_next = cmt_lgrps;
1142 	cmt_lgrps = lgrp;
1143 	group_create(&lgrp->cl_pgs);
1144 
1145 	return (lgrp);
1146 }
1147 
1148 /*
1149  * Interfaces to enable and disable power aware dispatching
1150  * The caller must be holding cpu_lock.
1151  *
1152  * Return 0 on success and -1 on failure.
1153  */
1154 int
1155 cmt_pad_enable(pghw_type_t type)
1156 {
1157 	group_t		*hwset;
1158 	group_iter_t	iter;
1159 	pg_cmt_t	*pg;
1160 
1161 	ASSERT(PGHW_IS_PM_DOMAIN(type));
1162 	ASSERT(MUTEX_HELD(&cpu_lock));
1163 
1164 	if ((hwset = pghw_set_lookup(type)) == NULL ||
1165 	    cmt_hw_blacklisted[type]) {
1166 		/*
1167 		 * Unable to find any instances of the specified type
1168 		 * of power domain, or the power domains have been blacklisted.
1169 		 */
1170 		return (-1);
1171 	}
1172 
1173 	/*
1174 	 * Iterate over the power domains, setting the default dispatcher
1175 	 * policy for power/performance optimization.
1176 	 *
1177 	 * Simply setting the policy isn't enough in the case where the power
1178 	 * domain is an only child of another PG. Because the dispatcher walks
1179 	 * the PG hierarchy in a top down fashion, the higher up PG's policy
1180 	 * will dominate. So promote the power domain above it's parent if both
1181 	 * PG and it's parent have the same CPUs to ensure it's policy
1182 	 * dominates.
1183 	 */
1184 	group_iter_init(&iter);
1185 	while ((pg = group_iterate(hwset, &iter)) != NULL) {
1186 		/*
1187 		 * If the power domain is an only child to a parent
1188 		 * not implementing the same policy, promote the child
1189 		 * above the parent to activate the policy.
1190 		 */
1191 		pg->cmt_policy = pg_cmt_policy(((pghw_t *)pg)->pghw_hw);
1192 		while ((pg->cmt_parent != NULL) &&
1193 		    (pg->cmt_parent->cmt_policy != pg->cmt_policy) &&
1194 		    (PG_NUM_CPUS((pg_t *)pg) ==
1195 		    PG_NUM_CPUS((pg_t *)pg->cmt_parent))) {
1196 			cmt_hier_promote(pg);
1197 		}
1198 	}
1199 
1200 	return (0);
1201 }
1202 
1203 int
1204 cmt_pad_disable(pghw_type_t type)
1205 {
1206 	group_t		*hwset;
1207 	group_iter_t	iter;
1208 	pg_cmt_t	*pg;
1209 	pg_cmt_t	*child;
1210 
1211 	ASSERT(PGHW_IS_PM_DOMAIN(type));
1212 	ASSERT(MUTEX_HELD(&cpu_lock));
1213 
1214 	if ((hwset = pghw_set_lookup(type)) == NULL) {
1215 		/*
1216 		 * Unable to find any instances of the specified type of
1217 		 * power domain.
1218 		 */
1219 		return (-1);
1220 	}
1221 	/*
1222 	 * Iterate over the power domains, setting the default dispatcher
1223 	 * policy for performance optimization (load balancing).
1224 	 */
1225 	group_iter_init(&iter);
1226 	while ((pg = group_iterate(hwset, &iter)) != NULL) {
1227 
1228 		/*
1229 		 * If the power domain has an only child that implements
1230 		 * policy other than load balancing, promote the child
1231 		 * above the power domain to ensure it's policy dominates.
1232 		 */
1233 		if (pg->cmt_children != NULL &&
1234 		    GROUP_SIZE(pg->cmt_children) == 1) {
1235 			child = GROUP_ACCESS(pg->cmt_children, 0);
1236 			if ((child->cmt_policy & CMT_BALANCE) == 0) {
1237 				cmt_hier_promote(child);
1238 			}
1239 		}
1240 		pg->cmt_policy = CMT_BALANCE;
1241 	}
1242 	return (0);
1243 }
1244 
1245 /* ARGSUSED */
1246 static void
1247 cmt_ev_thread_swtch(pg_t *pg, cpu_t *cp, hrtime_t now, kthread_t *old,
1248 		    kthread_t *new)
1249 {
1250 	pg_cmt_t	*cmt_pg = (pg_cmt_t *)pg;
1251 
1252 	if (old == cp->cpu_idle_thread) {
1253 		atomic_add_32(&cmt_pg->cmt_utilization, 1);
1254 	} else if (new == cp->cpu_idle_thread) {
1255 		atomic_add_32(&cmt_pg->cmt_utilization, -1);
1256 	}
1257 }
1258 
1259 /*
1260  * Macro to test whether a thread is currently runnable on a CPU in a PG.
1261  */
1262 #define	THREAD_RUNNABLE_IN_PG(t, pg)					\
1263 	((t)->t_state == TS_RUN &&					\
1264 	    (t)->t_disp_queue->disp_cpu &&				\
1265 	    bitset_in_set(&(pg)->cmt_cpus_actv_set,			\
1266 	    (t)->t_disp_queue->disp_cpu->cpu_seqid))
1267 
1268 static void
1269 cmt_ev_thread_swtch_pwr(pg_t *pg, cpu_t *cp, hrtime_t now, kthread_t *old,
1270     kthread_t *new)
1271 {
1272 	pg_cmt_t	*cmt = (pg_cmt_t *)pg;
1273 	cpupm_domain_t	*dom;
1274 	uint32_t	u;
1275 
1276 	if (old == cp->cpu_idle_thread) {
1277 		ASSERT(new != cp->cpu_idle_thread);
1278 		u = atomic_add_32_nv(&cmt->cmt_utilization, 1);
1279 		if (u == 1) {
1280 			/*
1281 			 * Notify the CPU power manager that the domain
1282 			 * is non-idle.
1283 			 */
1284 			dom = (cpupm_domain_t *)cmt->cmt_pg.pghw_handle;
1285 			cpupm_utilization_event(cp, now, dom,
1286 			    CPUPM_DOM_BUSY_FROM_IDLE);
1287 		}
1288 	} else if (new == cp->cpu_idle_thread) {
1289 		ASSERT(old != cp->cpu_idle_thread);
1290 		u = atomic_add_32_nv(&cmt->cmt_utilization, -1);
1291 		if (u == 0) {
1292 			/*
1293 			 * The domain is idle, notify the CPU power
1294 			 * manager.
1295 			 *
1296 			 * Avoid notifying if the thread is simply migrating
1297 			 * between CPUs in the domain.
1298 			 */
1299 			if (!THREAD_RUNNABLE_IN_PG(old, cmt)) {
1300 				dom = (cpupm_domain_t *)cmt->cmt_pg.pghw_handle;
1301 				cpupm_utilization_event(cp, now, dom,
1302 				    CPUPM_DOM_IDLE_FROM_BUSY);
1303 			}
1304 		}
1305 	}
1306 }
1307 
1308 /* ARGSUSED */
1309 static void
1310 cmt_ev_thread_remain_pwr(pg_t *pg, cpu_t *cp, kthread_t *t)
1311 {
1312 	pg_cmt_t	*cmt = (pg_cmt_t *)pg;
1313 	cpupm_domain_t	*dom;
1314 
1315 	dom = (cpupm_domain_t *)cmt->cmt_pg.pghw_handle;
1316 	cpupm_utilization_event(cp, (hrtime_t)0, dom, CPUPM_DOM_REMAIN_BUSY);
1317 }
1318 
1319 /*
1320  * Return the name of the CMT scheduling policy
1321  * being implemented across this PG
1322  */
1323 static char *
1324 pg_cmt_policy_name(pg_t *pg)
1325 {
1326 	pg_cmt_policy_t policy;
1327 
1328 	policy = ((pg_cmt_t *)pg)->cmt_policy;
1329 
1330 	if (policy & CMT_AFFINITY) {
1331 		if (policy & CMT_BALANCE)
1332 			return ("Load Balancing & Affinity");
1333 		else if (policy & CMT_COALESCE)
1334 			return ("Load Coalescence & Affinity");
1335 		else
1336 			return ("Affinity");
1337 	} else {
1338 		if (policy & CMT_BALANCE)
1339 			return ("Load Balancing");
1340 		else if (policy & CMT_COALESCE)
1341 			return ("Load Coalescence");
1342 		else
1343 			return ("None");
1344 	}
1345 }
1346 
1347 /*
1348  * Prune PG, and all other instances of PG's hardware sharing relationship
1349  * from the PG hierarchy.
1350  */
1351 static int
1352 pg_cmt_prune(pg_cmt_t *pg_bad, pg_cmt_t **lineage, int *sz)
1353 {
1354 	group_t		*hwset, *children;
1355 	int		i, j, r, size = *sz;
1356 	group_iter_t	hw_iter, child_iter;
1357 	pg_cpu_itr_t	cpu_iter;
1358 	pg_cmt_t	*pg, *child;
1359 	cpu_t		*cpu;
1360 	int		cap_needed;
1361 	pghw_type_t	hw;
1362 
1363 	ASSERT(MUTEX_HELD(&cpu_lock));
1364 
1365 	hw = ((pghw_t *)pg_bad)->pghw_hw;
1366 
1367 	if (hw == PGHW_POW_ACTIVE) {
1368 		cmn_err(CE_NOTE, "!Active CPUPM domain groups look suspect. "
1369 		    "Event Based CPUPM Unavailable");
1370 	} else if (hw == PGHW_POW_IDLE) {
1371 		cmn_err(CE_NOTE, "!Idle CPUPM domain groups look suspect. "
1372 		    "Dispatcher assisted CPUPM disabled.");
1373 	}
1374 
1375 	/*
1376 	 * Find and eliminate the PG from the lineage.
1377 	 */
1378 	for (i = 0; i < size; i++) {
1379 		if (lineage[i] == pg_bad) {
1380 			for (j = i; j < size - 1; j++)
1381 				lineage[j] = lineage[j + 1];
1382 			*sz = size - 1;
1383 			break;
1384 		}
1385 	}
1386 
1387 	/*
1388 	 * We'll prune all instances of the hardware sharing relationship
1389 	 * represented by pg. But before we do that (and pause CPUs) we need
1390 	 * to ensure the hierarchy's groups are properly sized.
1391 	 */
1392 	hwset = pghw_set_lookup(hw);
1393 
1394 	/*
1395 	 * Blacklist the hardware so that future groups won't be created.
1396 	 */
1397 	cmt_hw_blacklisted[hw] = 1;
1398 
1399 	/*
1400 	 * For each of the PGs being pruned, ensure sufficient capacity in
1401 	 * the siblings set for the PG's children
1402 	 */
1403 	group_iter_init(&hw_iter);
1404 	while ((pg = group_iterate(hwset, &hw_iter)) != NULL) {
1405 		/*
1406 		 * PG is being pruned, but if it is bringing up more than
1407 		 * one child, ask for more capacity in the siblings group.
1408 		 */
1409 		cap_needed = 0;
1410 		if (pg->cmt_children &&
1411 		    GROUP_SIZE(pg->cmt_children) > 1) {
1412 			cap_needed = GROUP_SIZE(pg->cmt_children) - 1;
1413 
1414 			group_expand(pg->cmt_siblings,
1415 			    GROUP_SIZE(pg->cmt_siblings) + cap_needed);
1416 
1417 			/*
1418 			 * If this is a top level group, also ensure the
1419 			 * capacity in the root lgrp level CMT grouping.
1420 			 */
1421 			if (pg->cmt_parent == NULL &&
1422 			    pg->cmt_siblings != &cmt_root->cl_pgs) {
1423 				group_expand(&cmt_root->cl_pgs,
1424 				    GROUP_SIZE(&cmt_root->cl_pgs) + cap_needed);
1425 			}
1426 		}
1427 	}
1428 
1429 	/*
1430 	 * We're operating on the PG hierarchy. Pause CPUs to ensure
1431 	 * exclusivity with respect to the dispatcher.
1432 	 */
1433 	pause_cpus(NULL);
1434 
1435 	/*
1436 	 * Prune all PG instances of the hardware sharing relationship
1437 	 * represented by pg.
1438 	 */
1439 	group_iter_init(&hw_iter);
1440 	while ((pg = group_iterate(hwset, &hw_iter)) != NULL) {
1441 
1442 		/*
1443 		 * Remove PG from it's group of siblings, if it's there.
1444 		 */
1445 		if (pg->cmt_siblings) {
1446 			(void) group_remove(pg->cmt_siblings, pg, GRP_NORESIZE);
1447 		}
1448 		if (pg->cmt_parent == NULL &&
1449 		    pg->cmt_siblings != &cmt_root->cl_pgs) {
1450 			(void) group_remove(&cmt_root->cl_pgs, pg,
1451 			    GRP_NORESIZE);
1452 		}
1453 		/*
1454 		 * Add PGs children to it's group of siblings.
1455 		 */
1456 		if (pg->cmt_children != NULL) {
1457 			children = pg->cmt_children;
1458 
1459 			group_iter_init(&child_iter);
1460 			while ((child = group_iterate(children, &child_iter))
1461 			    != NULL) {
1462 				/*
1463 				 * Transplant child from it's siblings set to
1464 				 * PGs.
1465 				 */
1466 				if (pg->cmt_siblings != NULL &&
1467 				    child->cmt_siblings != NULL &&
1468 				    group_remove(child->cmt_siblings, child,
1469 				    GRP_NORESIZE) != -1) {
1470 					r = group_add(pg->cmt_siblings, child,
1471 					    GRP_NORESIZE);
1472 					ASSERT(r == 0);
1473 				}
1474 			}
1475 		}
1476 
1477 		/*
1478 		 * Reset the callbacks to the defaults
1479 		 */
1480 		pg_callback_set_defaults((pg_t *)pg);
1481 
1482 		/*
1483 		 * Update all the CPU lineages in each of PG's CPUs
1484 		 */
1485 		PG_CPU_ITR_INIT(pg, cpu_iter);
1486 		while ((cpu = pg_cpu_next(&cpu_iter)) != NULL) {
1487 			group_t		*pgs;
1488 			pg_cmt_t	*cpu_pg;
1489 			group_iter_t	liter;	/* Iterator for the lineage */
1490 
1491 			/*
1492 			 * Iterate over the CPU's PGs updating the children
1493 			 * of the PG being promoted, since they have a new
1494 			 * parent and siblings set.
1495 			 */
1496 			pgs = &cpu->cpu_pg->pgs;
1497 			group_iter_init(&liter);
1498 			while ((cpu_pg = group_iterate(pgs, &liter)) != NULL) {
1499 				if (cpu_pg->cmt_parent == pg) {
1500 					cpu_pg->cmt_parent = pg->cmt_parent;
1501 					cpu_pg->cmt_siblings = pg->cmt_siblings;
1502 				}
1503 			}
1504 
1505 			/*
1506 			 * Update the CPU's lineages
1507 			 */
1508 			pgs = &cpu->cpu_pg->cmt_pgs;
1509 			(void) group_remove(pgs, pg, GRP_NORESIZE);
1510 			pgs = &cpu->cpu_pg->pgs;
1511 			(void) group_remove(pgs, pg, GRP_NORESIZE);
1512 		}
1513 	}
1514 	start_cpus();
1515 	return (0);
1516 }
1517 
1518 /*
1519  * Disable CMT scheduling
1520  */
1521 static void
1522 pg_cmt_disable(void)
1523 {
1524 	cpu_t	*cpu;
1525 
1526 	pause_cpus(NULL);
1527 	cpu = cpu_list;
1528 
1529 	do {
1530 		if (cpu->cpu_pg)
1531 			group_empty(&cpu->cpu_pg->cmt_pgs);
1532 	} while ((cpu = cpu->cpu_next) != cpu_list);
1533 
1534 	cmt_sched_disabled = 1;
1535 	start_cpus();
1536 	cmn_err(CE_NOTE, "!CMT thread placement optimizations unavailable");
1537 }
1538 
/*
 * Validate a CPU's CMT lineage: the "lineage" array holds *sz PGs sorted
 * by ascending CPU count, and each level's CPU set must be contained in
 * the next larger level (a concentric lineage). On detecting a violation,
 * attempt to repair the hierarchy by pruning PGs whose enumeration source
 * is untrusted; if repair is impossible, disable CMT scheduling entirely.
 *
 * *sz may be decremented as PGs are pruned. The caller must hold cpu_lock.
 * Returns 0 if the lineage is (or was made) valid, -1 if CMT scheduling
 * had to be disabled.
 */
static int
pg_cmt_lineage_validate(pg_cmt_t **lineage, int *sz)
{
	int		i, size;
	pg_cmt_t	*pg, *parent, *pg_bad;
	cpu_t		*cp;
	pg_cpu_itr_t	cpu_iter;

	ASSERT(MUTEX_HELD(&cpu_lock));

revalidate:
	/* Re-read the size; pruning below may have shrunk the lineage */
	size = *sz;
	pg_bad = NULL;
	for (i = 0; i < size - 1; i++) {

		pg = lineage[i];
		parent = lineage[i + 1];

		/*
		 * We assume that the lineage has already been sorted
		 * by the number of CPUs. In fact, we depend on it.
		 */
		ASSERT(PG_NUM_CPUS((pg_t *)pg) <= PG_NUM_CPUS((pg_t *)parent));

		/*
		 * Walk each of the CPUs in the PGs group, and verify that
		 * the next larger PG contains at least the CPUs in this one.
		 *
		 * Note: on error, pg and parent remain set to the
		 * offending level pair for the handling code below.
		 */
		PG_CPU_ITR_INIT((pg_t *)pg, cpu_iter);
		while ((cp = pg_cpu_next(&cpu_iter)) != NULL) {
			if (pg_cpu_find((pg_t *)parent, cp) == B_FALSE) {
				cmt_lineage_status = CMT_LINEAGE_NON_CONCENTRIC;
				goto handle_error;
			}
		}
	}

handle_error:
	/* Also reached on normal loop exit; VALID status simply falls out */
	switch (cmt_lineage_status) {
	case CMT_LINEAGE_VALID:
	case CMT_LINEAGE_REPAIRED:
		break;
	case CMT_LINEAGE_NON_CONCENTRIC:
		/*
		 * We've detected a non-concentric PG lineage.
		 *
		 * This can happen when some of the CPU grouping information
		 * is derived from buggy sources (for example, incorrect ACPI
		 * tables on x86 systems).
		 *
		 * We attempt to recover from this by pruning out the
		 * illegal groupings from the PG hierarchy, which means that
		 * we won't optimize for those levels, but we will for the
		 * remaining ones.
		 *
		 * If a given level has CPUs not found in it's parent, then
		 * we examine the PG and it's parent to see if either grouping
		 * is enumerated from potentially buggy sources.
		 *
		 * If one has less CPUs than the other, and contains CPUs
		 * not found in the parent, and it is an untrusted enumeration,
		 * then prune it. If both have the same number of CPUs, then
		 * prune the one that is untrusted.
		 *
		 * This process repeats until we have a concentric lineage,
		 * or we would have to prune out level derived from what we
		 * thought was a reliable source, in which case CMT scheduling
		 * is disabled all together.
		 */
		if ((PG_NUM_CPUS((pg_t *)pg) < PG_NUM_CPUS((pg_t *)parent)) &&
		    (PG_CMT_HW_SUSPECT(((pghw_t *)pg)->pghw_hw))) {
			pg_bad = pg;
		} else if (PG_NUM_CPUS((pg_t *)pg) ==
		    PG_NUM_CPUS((pg_t *)parent)) {
			if (PG_CMT_HW_SUSPECT(((pghw_t *)parent)->pghw_hw)) {
				pg_bad = parent;
			} else if (PG_CMT_HW_SUSPECT(((pghw_t *)pg)->pghw_hw)) {
				pg_bad = pg;
			}
		}
		if (pg_bad) {
			/* Prune succeeded; re-check the now-smaller lineage */
			if (pg_cmt_prune(pg_bad, lineage, sz) == 0) {
				cmt_lineage_status = CMT_LINEAGE_REPAIRED;
				goto revalidate;
			}
		}
		/*FALLTHROUGH*/
	default:
		/*
		 * If we're here, something has gone wrong in trying to
		 * recover from a illegal PG hierarchy, or we've encountered
		 * a validation error for which we don't know how to recover.
		 * In this case, disable CMT scheduling all together.
		 */
		pg_cmt_disable();
		cmt_lineage_status = CMT_LINEAGE_UNRECOVERABLE;
		return (-1);
	}
	return (0);
}
1639