xref: /titanic_51/usr/src/uts/common/disp/cmt.c (revision 8cb74972a66bde0af7b1a957d01e0095b82a8b91)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/systm.h>
27 #include <sys/types.h>
28 #include <sys/param.h>
29 #include <sys/thread.h>
30 #include <sys/cpuvar.h>
31 #include <sys/cpupart.h>
32 #include <sys/kmem.h>
33 #include <sys/cmn_err.h>
34 #include <sys/kstat.h>
35 #include <sys/processor.h>
36 #include <sys/disp.h>
37 #include <sys/group.h>
38 #include <sys/pghw.h>
39 #include <sys/bitset.h>
40 #include <sys/lgrp.h>
41 #include <sys/cmt.h>
42 #include <sys/cpu_pm.h>
43 
44 /*
45  * CMT scheduler / dispatcher support
46  *
47  * This file implements CMT scheduler support using Processor Groups.
48  * The CMT processor group class creates and maintains the CMT class
49  * specific processor group pg_cmt_t.
50  *
51  * ---------------------------- <-- pg_cmt_t *
52  * | pghw_t                   |
53  * ----------------------------
54  * | CMT class specific data  |
55  * | - hierarchy linkage      |
56  * | - CMT load balancing data|
57  * | - active CPU group/bitset|
58  * ----------------------------
59  *
60  * The scheduler/dispatcher leverages knowledge of the performance
61  * relevant CMT sharing relationships existing between cpus to implement
62  * optimized affinity, load balancing, and coalescence policies.
63  *
64  * Load balancing policy seeks to improve performance by minimizing
65  * contention over shared processor resources / facilities. Affinity
66  * policies seek to improve cache and TLB utilization. Coalescence
67  * policies improve resource utilization and ultimately power efficiency.
68  *
69  * The CMT PGs created by this class are already arranged into a
70  * hierarchy (which is done in the pghw layer). To implement the top-down
71  * CMT load balancing algorithm, the CMT PGs additionally maintain
72  * parent, child and sibling hierarchy relationships.
73  * Parent PGs always contain a superset of their children(s) resources,
74  * each PG can have at most one parent, and siblings are the group of PGs
75  * sharing the same parent.
76  *
77  * On NUMA systems, the CMT load balancing algorithm balances across the
78  * CMT PGs within their respective lgroups. On UMA based systems, there
79  * exists a top level group of PGs to balance across. On NUMA systems multiple
80  * top level groups are instantiated, where the top level balancing begins by
81  * balancing across the CMT PGs within their respective (per lgroup) top level
82  * groups.
83  */
84 static cmt_lgrp_t	*cmt_lgrps = NULL;	/* cmt_lgrps list head */
85 static cmt_lgrp_t	*cpu0_lgrp = NULL;	/* boot CPU's initial lgrp */
86 						/* used for null_proc_lpa */
87 cmt_lgrp_t		*cmt_root = NULL;	/* Reference to root cmt pg */
88 
89 static int		is_cpu0 = 1; /* true if this is boot CPU context */
90 
91 /*
92  * Array of hardware sharing relationships that are blacklisted.
93  * PGs won't be instantiated for blacklisted hardware sharing relationships.
94  */
95 static int		cmt_hw_blacklisted[PGHW_NUM_COMPONENTS];
96 
97 /*
98  * Set this to non-zero to disable CMT scheduling
99  * This must be done via kmdb -d, as /etc/system will be too late
100  */
101 int			cmt_sched_disabled = 0;
102 
/*
 * Result codes produced by CMT lineage validation
 * (see pg_cmt_lineage_validate()).
 *
 * pg_cmt_cpu_init() only continues building the hierarchy when the
 * result is CMT_LINEAGE_VALID or CMT_LINEAGE_REPAIRED; every other
 * value is treated as a failure.
 */
typedef enum cmt_lineage_validation {
	CMT_LINEAGE_VALID,
	CMT_LINEAGE_NON_CONCENTRIC,
	CMT_LINEAGE_PG_SPANS_LGRPS,
	CMT_LINEAGE_NON_PROMOTABLE,
	CMT_LINEAGE_REPAIRED,
	CMT_LINEAGE_UNRECOVERABLE
} cmt_lineage_validation_t;

/*
 * Validation status of the lineage currently being constructed.
 * cpu_lock must be held in order to modify it.
 */
cmt_lineage_validation_t	cmt_lineage_status = CMT_LINEAGE_VALID;
121 
122 /*
123  * Power domain definitions (on x86) are defined by ACPI, and
124  * therefore may be subject to BIOS bugs.
125  */
126 #define	PG_CMT_HW_SUSPECT(hw)	PGHW_IS_PM_DOMAIN(hw)
127 
128 /*
129  * Macro to test if PG is managed by the CMT PG class
130  */
131 #define	IS_CMT_PG(pg)	(((pg_t *)(pg))->pg_class->pgc_id == pg_cmt_class_id)
132 
133 static pg_cid_t		pg_cmt_class_id;		/* PG class id */
134 
135 static pg_t		*pg_cmt_alloc();
136 static void		pg_cmt_free(pg_t *);
137 static void		pg_cmt_cpu_init(cpu_t *, cpu_pg_t *);
138 static void		pg_cmt_cpu_fini(cpu_t *, cpu_pg_t *);
139 static void		pg_cmt_cpu_active(cpu_t *);
140 static void		pg_cmt_cpu_inactive(cpu_t *);
141 static void		pg_cmt_cpupart_in(cpu_t *, cpupart_t *);
142 static void		pg_cmt_cpupart_move(cpu_t *, cpupart_t *, cpupart_t *);
143 static char		*pg_cmt_policy_name(pg_t *);
144 static void		pg_cmt_hier_sort(pg_cmt_t **, int);
145 static pg_cmt_t		*pg_cmt_hier_rank(pg_cmt_t *, pg_cmt_t *);
146 static int		pg_cmt_cpu_belongs(pg_t *, cpu_t *);
147 static int		pg_cmt_hw(pghw_type_t);
148 static cmt_lgrp_t	*pg_cmt_find_lgrp(lgrp_handle_t);
149 static cmt_lgrp_t	*pg_cmt_lgrp_create(lgrp_handle_t);
150 static void		cmt_ev_thread_swtch(pg_t *, cpu_t *, hrtime_t,
151 			    kthread_t *, kthread_t *);
152 static void		cmt_ev_thread_swtch_pwr(pg_t *, cpu_t *, hrtime_t,
153 			    kthread_t *, kthread_t *);
154 static void		cmt_ev_thread_remain_pwr(pg_t *, cpu_t *, kthread_t *);
155 static cmt_lineage_validation_t	pg_cmt_lineage_validate(pg_cmt_t **, int *);
156 
157 
158 /*
159  * CMT PG ops
160  */
161 struct pg_ops pg_ops_cmt = {
162 	pg_cmt_alloc,
163 	pg_cmt_free,
164 	pg_cmt_cpu_init,
165 	pg_cmt_cpu_fini,
166 	pg_cmt_cpu_active,
167 	pg_cmt_cpu_inactive,
168 	pg_cmt_cpupart_in,
169 	NULL,			/* cpupart_out */
170 	pg_cmt_cpupart_move,
171 	pg_cmt_cpu_belongs,
172 	pg_cmt_policy_name,
173 };
174 
175 /*
176  * Initialize the CMT PG class
177  */
178 void
179 pg_cmt_class_init(void)
180 {
181 	if (cmt_sched_disabled)
182 		return;
183 
184 	pg_cmt_class_id = pg_class_register("cmt", &pg_ops_cmt, PGR_PHYSICAL);
185 }
186 
187 /*
188  * Called to indicate a new CPU has started up so
189  * that either t0 or the slave startup thread can
190  * be accounted for.
191  */
192 void
193 pg_cmt_cpu_startup(cpu_t *cp)
194 {
195 	pg_ev_thread_swtch(cp, gethrtime_unscaled(), cp->cpu_idle_thread,
196 	    cp->cpu_thread);
197 }
198 
199 /*
200  * Return non-zero if thread can migrate between "from" and "to"
201  * without a performance penalty
202  */
203 int
204 pg_cmt_can_migrate(cpu_t *from, cpu_t *to)
205 {
206 	if (from->cpu_physid->cpu_cacheid ==
207 	    to->cpu_physid->cpu_cacheid)
208 		return (1);
209 	return (0);
210 }
211 
212 /*
213  * CMT class specific PG allocation
214  */
215 static pg_t *
216 pg_cmt_alloc(void)
217 {
218 	return (kmem_zalloc(sizeof (pg_cmt_t), KM_NOSLEEP));
219 }
220 
221 /*
222  * Class specific PG de-allocation
223  */
224 static void
225 pg_cmt_free(pg_t *pg)
226 {
227 	ASSERT(pg != NULL);
228 	ASSERT(IS_CMT_PG(pg));
229 
230 	kmem_free((pg_cmt_t *)pg, sizeof (pg_cmt_t));
231 }
232 
233 /*
234  * Given a hardware sharing relationship, return which dispatcher
235  * policies should be implemented to optimize performance and efficiency
236  */
237 static pg_cmt_policy_t
238 pg_cmt_policy(pghw_type_t hw)
239 {
240 	pg_cmt_policy_t p;
241 
242 	/*
243 	 * Give the platform a chance to override the default
244 	 */
245 	if ((p = pg_plat_cmt_policy(hw)) != CMT_NO_POLICY)
246 		return (p);
247 
248 	switch (hw) {
249 	case PGHW_IPIPE:
250 	case PGHW_FPU:
251 	case PGHW_CHIP:
252 		return (CMT_BALANCE);
253 	case PGHW_CACHE:
254 		return (CMT_AFFINITY);
255 	case PGHW_POW_ACTIVE:
256 	case PGHW_POW_IDLE:
257 		return (CMT_BALANCE);
258 	default:
259 		return (CMT_NO_POLICY);
260 	}
261 }
262 
263 /*
264  * Rank the importance of optimizing for the pg1 relationship vs.
265  * the pg2 relationship.
266  */
267 static pg_cmt_t *
268 pg_cmt_hier_rank(pg_cmt_t *pg1, pg_cmt_t *pg2)
269 {
270 	pghw_type_t hw1 = ((pghw_t *)pg1)->pghw_hw;
271 	pghw_type_t hw2 = ((pghw_t *)pg2)->pghw_hw;
272 
273 	/*
274 	 * A power domain is only important if CPUPM is enabled.
275 	 */
276 	if (cpupm_get_policy() == CPUPM_POLICY_DISABLED) {
277 		if (PGHW_IS_PM_DOMAIN(hw1) && !PGHW_IS_PM_DOMAIN(hw2))
278 			return (pg2);
279 		if (PGHW_IS_PM_DOMAIN(hw2) && !PGHW_IS_PM_DOMAIN(hw1))
280 			return (pg1);
281 	}
282 
283 	/*
284 	 * Otherwise, ask the platform
285 	 */
286 	if (pg_plat_hw_rank(hw1, hw2) == hw1)
287 		return (pg1);
288 	else
289 		return (pg2);
290 }
291 
292 /*
293  * Initialize CMT callbacks for the given PG
294  */
295 static void
296 cmt_callback_init(pg_t *pg)
297 {
298 	switch (((pghw_t *)pg)->pghw_hw) {
299 	case PGHW_POW_ACTIVE:
300 		pg->pg_cb.thread_swtch = cmt_ev_thread_swtch_pwr;
301 		pg->pg_cb.thread_remain = cmt_ev_thread_remain_pwr;
302 		break;
303 	default:
304 		pg->pg_cb.thread_swtch = cmt_ev_thread_swtch;
305 
306 	}
307 }
308 
309 /*
310  * Promote PG above it's current parent.
311  * This is only legal if PG has an equal or greater number of CPUs
312  * than it's parent.
313  */
314 static void
315 cmt_hier_promote(pg_cmt_t *pg)
316 {
317 	pg_cmt_t	*parent;
318 	group_t		*children;
319 	cpu_t		*cpu;
320 	group_iter_t	iter;
321 	pg_cpu_itr_t	cpu_iter;
322 	int		r;
323 	int		err;
324 
325 	ASSERT(MUTEX_HELD(&cpu_lock));
326 
327 	parent = pg->cmt_parent;
328 	if (parent == NULL) {
329 		/*
330 		 * Nothing to do
331 		 */
332 		return;
333 	}
334 
335 	ASSERT(PG_NUM_CPUS((pg_t *)pg) >= PG_NUM_CPUS((pg_t *)parent));
336 
337 	/*
338 	 * We're changing around the hierarchy, which is actively traversed
339 	 * by the dispatcher. Pause CPUS to ensure exclusivity.
340 	 */
341 	pause_cpus(NULL);
342 
343 	/*
344 	 * If necessary, update the parent's sibling set, replacing parent
345 	 * with PG.
346 	 */
347 	if (parent->cmt_siblings) {
348 		if (group_remove(parent->cmt_siblings, parent, GRP_NORESIZE)
349 		    != -1) {
350 			r = group_add(parent->cmt_siblings, pg, GRP_NORESIZE);
351 			ASSERT(r != -1);
352 		}
353 	}
354 
355 	/*
356 	 * If the parent is at the top of the hierarchy, replace it's entry
357 	 * in the root lgroup's group of top level PGs.
358 	 */
359 	if (parent->cmt_parent == NULL &&
360 	    parent->cmt_siblings != &cmt_root->cl_pgs) {
361 		if (group_remove(&cmt_root->cl_pgs, parent, GRP_NORESIZE)
362 		    != -1) {
363 			r = group_add(&cmt_root->cl_pgs, pg, GRP_NORESIZE);
364 			ASSERT(r != -1);
365 		}
366 	}
367 
368 	/*
369 	 * We assume (and therefore assert) that the PG being promoted is an
370 	 * only child of it's parent. Update the parent's children set
371 	 * replacing PG's entry with the parent (since the parent is becoming
372 	 * the child). Then have PG and the parent swap children sets.
373 	 */
374 	ASSERT(GROUP_SIZE(parent->cmt_children) <= 1);
375 	if (group_remove(parent->cmt_children, pg, GRP_NORESIZE) != -1) {
376 		r = group_add(parent->cmt_children, parent, GRP_NORESIZE);
377 		ASSERT(r != -1);
378 	}
379 
380 	children = pg->cmt_children;
381 	pg->cmt_children = parent->cmt_children;
382 	parent->cmt_children = children;
383 
384 	/*
385 	 * Update the sibling references for PG and it's parent
386 	 */
387 	pg->cmt_siblings = parent->cmt_siblings;
388 	parent->cmt_siblings = pg->cmt_children;
389 
390 	/*
391 	 * Update any cached lineages in the per CPU pg data.
392 	 */
393 	PG_CPU_ITR_INIT(pg, cpu_iter);
394 	while ((cpu = pg_cpu_next(&cpu_iter)) != NULL) {
395 		int		idx;
396 		group_t		*pgs;
397 		pg_cmt_t	*cpu_pg;
398 
399 		/*
400 		 * Iterate over the CPU's PGs updating the children
401 		 * of the PG being promoted, since they have a new parent.
402 		 */
403 		pgs = &cpu->cpu_pg->pgs;
404 		group_iter_init(&iter);
405 		while ((cpu_pg = group_iterate(pgs, &iter)) != NULL) {
406 			if (cpu_pg->cmt_parent == pg) {
407 				cpu_pg->cmt_parent = parent;
408 			}
409 		}
410 
411 		/*
412 		 * Update the CMT load balancing lineage
413 		 */
414 		pgs = &cpu->cpu_pg->cmt_pgs;
415 		if ((idx = group_find(pgs, (void *)pg)) == -1) {
416 			/*
417 			 * Unless this is the CPU who's lineage is being
418 			 * constructed, the PG being promoted should be
419 			 * in the lineage.
420 			 */
421 			ASSERT(GROUP_SIZE(pgs) == 0);
422 			continue;
423 		}
424 
425 		ASSERT(GROUP_ACCESS(pgs, idx - 1) == parent);
426 		ASSERT(idx > 0);
427 
428 		/*
429 		 * Have the child and the parent swap places in the CPU's
430 		 * lineage
431 		 */
432 		group_remove_at(pgs, idx);
433 		group_remove_at(pgs, idx - 1);
434 		err = group_add_at(pgs, parent, idx);
435 		ASSERT(err == 0);
436 		err = group_add_at(pgs, pg, idx - 1);
437 		ASSERT(err == 0);
438 	}
439 
440 	/*
441 	 * Update the parent references for PG and it's parent
442 	 */
443 	pg->cmt_parent = parent->cmt_parent;
444 	parent->cmt_parent = pg;
445 
446 	start_cpus();
447 }
448 
449 /*
450  * CMT class callback for a new CPU entering the system
451  */
452 static void
453 pg_cmt_cpu_init(cpu_t *cp, cpu_pg_t *cpu_pg)
454 {
455 	pg_cmt_t	*pg;
456 	group_t		*cmt_pgs;
457 	int		levels, level;
458 	pghw_type_t	hw;
459 	pg_t		*pg_cache = NULL;
460 	pg_cmt_t	*cpu_cmt_hier[PGHW_NUM_COMPONENTS];
461 	lgrp_handle_t	lgrp_handle;
462 	cmt_lgrp_t	*lgrp;
463 	cmt_lineage_validation_t	lineage_status;
464 
465 	ASSERT(MUTEX_HELD(&cpu_lock));
466 
467 	if (cmt_sched_disabled)
468 		return;
469 
470 	/*
471 	 * A new CPU is coming into the system.
472 	 * Interrogate the platform to see if the CPU
473 	 * has any performance or efficiency relevant
474 	 * sharing relationships
475 	 */
476 	cmt_pgs = &cpu_pg->cmt_pgs;
477 	cpu_pg->cmt_lineage = NULL;
478 
479 	bzero(cpu_cmt_hier, sizeof (cpu_cmt_hier));
480 	levels = 0;
481 	for (hw = PGHW_START; hw < PGHW_NUM_COMPONENTS; hw++) {
482 
483 		pg_cmt_policy_t	policy;
484 
485 		/*
486 		 * We're only interested in the hw sharing relationships
487 		 * for which we know how to optimize.
488 		 */
489 		policy = pg_cmt_policy(hw);
490 		if (policy == CMT_NO_POLICY ||
491 		    pg_plat_hw_shared(cp, hw) == 0)
492 			continue;
493 
494 		/*
495 		 * Continue if the hardware sharing relationship has been
496 		 * blacklisted.
497 		 */
498 		if (cmt_hw_blacklisted[hw]) {
499 			continue;
500 		}
501 
502 		/*
503 		 * Find (or create) the PG associated with
504 		 * the hw sharing relationship in which cp
505 		 * belongs.
506 		 *
507 		 * Determine if a suitable PG already
508 		 * exists, or if one needs to be created.
509 		 */
510 		pg = (pg_cmt_t *)pghw_place_cpu(cp, hw);
511 		if (pg == NULL) {
512 			/*
513 			 * Create a new one.
514 			 * Initialize the common...
515 			 */
516 			pg = (pg_cmt_t *)pg_create(pg_cmt_class_id);
517 
518 			/* ... physical ... */
519 			pghw_init((pghw_t *)pg, cp, hw);
520 
521 			/*
522 			 * ... and CMT specific portions of the
523 			 * structure.
524 			 */
525 			pg->cmt_policy = policy;
526 
527 			/* CMT event callbacks */
528 			cmt_callback_init((pg_t *)pg);
529 
530 			bitset_init(&pg->cmt_cpus_actv_set);
531 			group_create(&pg->cmt_cpus_actv);
532 		} else {
533 			ASSERT(IS_CMT_PG(pg));
534 		}
535 
536 		/* Add the CPU to the PG */
537 		pg_cpu_add((pg_t *)pg, cp, cpu_pg);
538 
539 		/*
540 		 * Ensure capacity of the active CPU group/bitset
541 		 */
542 		group_expand(&pg->cmt_cpus_actv,
543 		    GROUP_SIZE(&((pg_t *)pg)->pg_cpus));
544 
545 		if (cp->cpu_seqid >=
546 		    bitset_capacity(&pg->cmt_cpus_actv_set)) {
547 			bitset_resize(&pg->cmt_cpus_actv_set,
548 			    cp->cpu_seqid + 1);
549 		}
550 
551 		/*
552 		 * Build a lineage of CMT PGs for load balancing / coalescence
553 		 */
554 		if (policy & (CMT_BALANCE | CMT_COALESCE)) {
555 			cpu_cmt_hier[levels++] = pg;
556 		}
557 
558 		/* Cache this for later */
559 		if (hw == PGHW_CACHE)
560 			pg_cache = (pg_t *)pg;
561 	}
562 
563 	group_expand(cmt_pgs, levels);
564 
565 	if (cmt_root == NULL)
566 		cmt_root = pg_cmt_lgrp_create(lgrp_plat_root_hand());
567 
568 	/*
569 	 * Find the lgrp that encapsulates this CPU's CMT hierarchy
570 	 */
571 	lgrp_handle = lgrp_plat_cpu_to_hand(cp->cpu_id);
572 	if ((lgrp = pg_cmt_find_lgrp(lgrp_handle)) == NULL)
573 		lgrp = pg_cmt_lgrp_create(lgrp_handle);
574 
575 	/*
576 	 * Ascendingly sort the PGs in the lineage by number of CPUs
577 	 */
578 	pg_cmt_hier_sort(cpu_cmt_hier, levels);
579 
580 	/*
581 	 * Examine the lineage and validate it.
582 	 * This routine will also try to fix the lineage along with the
583 	 * rest of the PG hierarchy should it detect an issue.
584 	 *
585 	 * If it returns anything other than VALID or REPAIRED, an
586 	 * unrecoverable error has occurred, and we cannot proceed.
587 	 */
588 	lineage_status = pg_cmt_lineage_validate(cpu_cmt_hier, &levels);
589 	if ((lineage_status != CMT_LINEAGE_VALID) &&
590 	    (lineage_status != CMT_LINEAGE_REPAIRED))
591 		return;
592 
593 	/*
594 	 * For existing PGs in the lineage, verify that the parent is
595 	 * correct, as the generation in the lineage may have changed
596 	 * as a result of the sorting. Start the traversal at the top
597 	 * of the lineage, moving down.
598 	 */
599 	for (level = levels - 1; level >= 0; ) {
600 		int reorg;
601 
602 		reorg = 0;
603 		pg = cpu_cmt_hier[level];
604 
605 		/*
606 		 * Promote PGs at an incorrect generation into place.
607 		 */
608 		while (pg->cmt_parent &&
609 		    pg->cmt_parent != cpu_cmt_hier[level + 1]) {
610 			cmt_hier_promote(pg);
611 			reorg++;
612 		}
613 		if (reorg > 0)
614 			level = levels - 1;
615 		else
616 			level--;
617 	}
618 
619 	/*
620 	 * For each of the PGs in the CPU's lineage:
621 	 *	- Add an entry in the CPU sorted CMT PG group
622 	 *	  which is used for top down CMT load balancing
623 	 *	- Tie the PG into the CMT hierarchy by connecting
624 	 *	  it to it's parent and siblings.
625 	 */
626 	for (level = 0; level < levels; level++) {
627 		uint_t		children;
628 		int		err;
629 
630 		pg = cpu_cmt_hier[level];
631 		err = group_add_at(cmt_pgs, pg, levels - level - 1);
632 		ASSERT(err == 0);
633 
634 		if (level == 0)
635 			cpu_pg->cmt_lineage = (pg_t *)pg;
636 
637 		if (pg->cmt_siblings != NULL) {
638 			/* Already initialized */
639 			ASSERT(pg->cmt_parent == NULL ||
640 			    pg->cmt_parent == cpu_cmt_hier[level + 1]);
641 			ASSERT(pg->cmt_siblings == &lgrp->cl_pgs ||
642 			    ((pg->cmt_parent != NULL) &&
643 			    pg->cmt_siblings == pg->cmt_parent->cmt_children));
644 			continue;
645 		}
646 
647 		if ((level + 1) == levels) {
648 			pg->cmt_parent = NULL;
649 
650 			pg->cmt_siblings = &lgrp->cl_pgs;
651 			children = ++lgrp->cl_npgs;
652 			if (cmt_root != lgrp)
653 				cmt_root->cl_npgs++;
654 		} else {
655 			pg->cmt_parent = cpu_cmt_hier[level + 1];
656 
657 			/*
658 			 * A good parent keeps track of their children.
659 			 * The parent's children group is also the PG's
660 			 * siblings.
661 			 */
662 			if (pg->cmt_parent->cmt_children == NULL) {
663 				pg->cmt_parent->cmt_children =
664 				    kmem_zalloc(sizeof (group_t), KM_SLEEP);
665 				group_create(pg->cmt_parent->cmt_children);
666 			}
667 			pg->cmt_siblings = pg->cmt_parent->cmt_children;
668 			children = ++pg->cmt_parent->cmt_nchildren;
669 		}
670 
671 		group_expand(pg->cmt_siblings, children);
672 		group_expand(&cmt_root->cl_pgs, cmt_root->cl_npgs);
673 	}
674 
675 	/*
676 	 * Cache the chip and core IDs in the cpu_t->cpu_physid structure
677 	 * for fast lookups later.
678 	 */
679 	if (cp->cpu_physid) {
680 		cp->cpu_physid->cpu_chipid =
681 		    pg_plat_hw_instance_id(cp, PGHW_CHIP);
682 		cp->cpu_physid->cpu_coreid = pg_plat_get_core_id(cp);
683 
684 		/*
685 		 * If this cpu has a PG representing shared cache, then set
686 		 * cpu_cacheid to that PG's logical id
687 		 */
688 		if (pg_cache)
689 			cp->cpu_physid->cpu_cacheid = pg_cache->pg_id;
690 	}
691 
692 	/* CPU0 only initialization */
693 	if (is_cpu0) {
694 		pg_cmt_cpu_startup(cp);
695 		is_cpu0 = 0;
696 		cpu0_lgrp = lgrp;
697 	}
698 
699 }
700 
701 /*
702  * Class callback when a CPU is leaving the system (deletion)
703  */
704 static void
705 pg_cmt_cpu_fini(cpu_t *cp, cpu_pg_t *cpu_pg)
706 {
707 	group_iter_t	i;
708 	pg_cmt_t	*pg;
709 	group_t		*pgs, *cmt_pgs;
710 	lgrp_handle_t	lgrp_handle;
711 	cmt_lgrp_t	*lgrp;
712 
713 	if (cmt_sched_disabled)
714 		return;
715 
716 	pgs = &cpu_pg->pgs;
717 	cmt_pgs = &cpu_pg->cmt_pgs;
718 
719 	/*
720 	 * Find the lgroup that encapsulates this CPU's CMT hierarchy
721 	 */
722 	lgrp_handle = lgrp_plat_cpu_to_hand(cp->cpu_id);
723 
724 	lgrp = pg_cmt_find_lgrp(lgrp_handle);
725 	if (ncpus == 1 && lgrp != cpu0_lgrp) {
726 		/*
727 		 * One might wonder how we could be deconfiguring the
728 		 * only CPU in the system.
729 		 *
730 		 * On Starcat systems when null_proc_lpa is detected,
731 		 * the boot CPU (which is already configured into a leaf
732 		 * lgroup), is moved into the root lgroup. This is done by
733 		 * deconfiguring it from both lgroups and processor
734 		 * groups), and then later reconfiguring it back in.  This
735 		 * call to pg_cmt_cpu_fini() is part of that deconfiguration.
736 		 *
737 		 * This special case is detected by noting that the platform
738 		 * has changed the CPU's lgrp affiliation (since it now
739 		 * belongs in the root). In this case, use the cmt_lgrp_t
740 		 * cached for the boot CPU, since this is what needs to be
741 		 * torn down.
742 		 */
743 		lgrp = cpu0_lgrp;
744 	}
745 
746 	ASSERT(lgrp != NULL);
747 
748 	/*
749 	 * First, clean up anything load balancing specific for each of
750 	 * the CPU's PGs that participated in CMT load balancing
751 	 */
752 	pg = (pg_cmt_t *)cpu_pg->cmt_lineage;
753 	while (pg != NULL) {
754 
755 		/*
756 		 * Remove the PG from the CPU's load balancing lineage
757 		 */
758 		(void) group_remove(cmt_pgs, pg, GRP_RESIZE);
759 
760 		/*
761 		 * If it's about to become empty, destroy it's children
762 		 * group, and remove it's reference from it's siblings.
763 		 * This is done here (rather than below) to avoid removing
764 		 * our reference from a PG that we just eliminated.
765 		 */
766 		if (GROUP_SIZE(&((pg_t *)pg)->pg_cpus) == 1) {
767 			if (pg->cmt_children != NULL)
768 				group_destroy(pg->cmt_children);
769 			if (pg->cmt_siblings != NULL) {
770 				if (pg->cmt_siblings == &lgrp->cl_pgs)
771 					lgrp->cl_npgs--;
772 				else
773 					pg->cmt_parent->cmt_nchildren--;
774 			}
775 		}
776 		pg = pg->cmt_parent;
777 	}
778 	ASSERT(GROUP_SIZE(cmt_pgs) == 0);
779 
780 	/*
781 	 * Now that the load balancing lineage updates have happened,
782 	 * remove the CPU from all it's PGs (destroying any that become
783 	 * empty).
784 	 */
785 	group_iter_init(&i);
786 	while ((pg = group_iterate(pgs, &i)) != NULL) {
787 		if (IS_CMT_PG(pg) == 0)
788 			continue;
789 
790 		pg_cpu_delete((pg_t *)pg, cp, cpu_pg);
791 		/*
792 		 * Deleting the CPU from the PG changes the CPU's
793 		 * PG group over which we are actively iterating
794 		 * Re-initialize the iteration
795 		 */
796 		group_iter_init(&i);
797 
798 		if (GROUP_SIZE(&((pg_t *)pg)->pg_cpus) == 0) {
799 
800 			/*
801 			 * The PG has become zero sized, so destroy it.
802 			 */
803 			group_destroy(&pg->cmt_cpus_actv);
804 			bitset_fini(&pg->cmt_cpus_actv_set);
805 			pghw_fini((pghw_t *)pg);
806 
807 			pg_destroy((pg_t *)pg);
808 		}
809 	}
810 }
811 
812 /*
813  * Class callback when a CPU is entering a cpu partition
814  */
815 static void
816 pg_cmt_cpupart_in(cpu_t *cp, cpupart_t *pp)
817 {
818 	group_t		*pgs;
819 	pg_t		*pg;
820 	group_iter_t	i;
821 
822 	ASSERT(MUTEX_HELD(&cpu_lock));
823 
824 	if (cmt_sched_disabled)
825 		return;
826 
827 	pgs = &cp->cpu_pg->pgs;
828 
829 	/*
830 	 * Ensure that the new partition's PG bitset
831 	 * is large enough for all CMT PG's to which cp
832 	 * belongs
833 	 */
834 	group_iter_init(&i);
835 	while ((pg = group_iterate(pgs, &i)) != NULL) {
836 		if (IS_CMT_PG(pg) == 0)
837 			continue;
838 
839 		if (bitset_capacity(&pp->cp_cmt_pgs) <= pg->pg_id)
840 			bitset_resize(&pp->cp_cmt_pgs, pg->pg_id + 1);
841 	}
842 }
843 
844 /*
845  * Class callback when a CPU is actually moving partitions
846  */
847 static void
848 pg_cmt_cpupart_move(cpu_t *cp, cpupart_t *oldpp, cpupart_t *newpp)
849 {
850 	cpu_t		*cpp;
851 	group_t		*pgs;
852 	pg_t		*pg;
853 	group_iter_t	pg_iter;
854 	pg_cpu_itr_t	cpu_iter;
855 	boolean_t	found;
856 
857 	ASSERT(MUTEX_HELD(&cpu_lock));
858 
859 	if (cmt_sched_disabled)
860 		return;
861 
862 	pgs = &cp->cpu_pg->pgs;
863 	group_iter_init(&pg_iter);
864 
865 	/*
866 	 * Iterate over the CPUs CMT PGs
867 	 */
868 	while ((pg = group_iterate(pgs, &pg_iter)) != NULL) {
869 
870 		if (IS_CMT_PG(pg) == 0)
871 			continue;
872 
873 		/*
874 		 * Add the PG to the bitset in the new partition.
875 		 */
876 		bitset_add(&newpp->cp_cmt_pgs, pg->pg_id);
877 
878 		/*
879 		 * Remove the PG from the bitset in the old partition
880 		 * if the last of the PG's CPUs have left.
881 		 */
882 		found = B_FALSE;
883 		PG_CPU_ITR_INIT(pg, cpu_iter);
884 		while ((cpp = pg_cpu_next(&cpu_iter)) != NULL) {
885 			if (cpp == cp)
886 				continue;
887 			if (CPU_ACTIVE(cpp) &&
888 			    cpp->cpu_part->cp_id == oldpp->cp_id) {
889 				found = B_TRUE;
890 				break;
891 			}
892 		}
893 		if (!found)
894 			bitset_del(&cp->cpu_part->cp_cmt_pgs, pg->pg_id);
895 	}
896 }
897 
898 /*
899  * Class callback when a CPU becomes active (online)
900  *
901  * This is called in a context where CPUs are paused
902  */
903 static void
904 pg_cmt_cpu_active(cpu_t *cp)
905 {
906 	int		err;
907 	group_iter_t	i;
908 	pg_cmt_t	*pg;
909 	group_t		*pgs;
910 
911 	ASSERT(MUTEX_HELD(&cpu_lock));
912 
913 	if (cmt_sched_disabled)
914 		return;
915 
916 	pgs = &cp->cpu_pg->pgs;
917 	group_iter_init(&i);
918 
919 	/*
920 	 * Iterate over the CPU's PGs
921 	 */
922 	while ((pg = group_iterate(pgs, &i)) != NULL) {
923 
924 		if (IS_CMT_PG(pg) == 0)
925 			continue;
926 
927 		err = group_add(&pg->cmt_cpus_actv, cp, GRP_NORESIZE);
928 		ASSERT(err == 0);
929 
930 		/*
931 		 * If this is the first active CPU in the PG, and it
932 		 * represents a hardware sharing relationship over which
933 		 * CMT load balancing is performed, add it as a candidate
934 		 * for balancing with it's siblings.
935 		 */
936 		if (GROUP_SIZE(&pg->cmt_cpus_actv) == 1 &&
937 		    (pg->cmt_policy & (CMT_BALANCE | CMT_COALESCE))) {
938 			err = group_add(pg->cmt_siblings, pg, GRP_NORESIZE);
939 			ASSERT(err == 0);
940 
941 			/*
942 			 * If this is a top level PG, add it as a balancing
943 			 * candidate when balancing within the root lgroup.
944 			 */
945 			if (pg->cmt_parent == NULL &&
946 			    pg->cmt_siblings != &cmt_root->cl_pgs) {
947 				err = group_add(&cmt_root->cl_pgs, pg,
948 				    GRP_NORESIZE);
949 				ASSERT(err == 0);
950 			}
951 		}
952 
953 		/*
954 		 * Notate the CPU in the PGs active CPU bitset.
955 		 * Also notate the PG as being active in it's associated
956 		 * partition
957 		 */
958 		bitset_add(&pg->cmt_cpus_actv_set, cp->cpu_seqid);
959 		bitset_add(&cp->cpu_part->cp_cmt_pgs, ((pg_t *)pg)->pg_id);
960 	}
961 }
962 
963 /*
964  * Class callback when a CPU goes inactive (offline)
965  *
966  * This is called in a context where CPUs are paused
967  */
968 static void
969 pg_cmt_cpu_inactive(cpu_t *cp)
970 {
971 	int		err;
972 	group_t		*pgs;
973 	pg_cmt_t	*pg;
974 	cpu_t		*cpp;
975 	group_iter_t	i;
976 	pg_cpu_itr_t	cpu_itr;
977 	boolean_t	found;
978 
979 	ASSERT(MUTEX_HELD(&cpu_lock));
980 
981 	if (cmt_sched_disabled)
982 		return;
983 
984 	pgs = &cp->cpu_pg->pgs;
985 	group_iter_init(&i);
986 
987 	while ((pg = group_iterate(pgs, &i)) != NULL) {
988 
989 		if (IS_CMT_PG(pg) == 0)
990 			continue;
991 
992 		/*
993 		 * Remove the CPU from the CMT PGs active CPU group
994 		 * bitmap
995 		 */
996 		err = group_remove(&pg->cmt_cpus_actv, cp, GRP_NORESIZE);
997 		ASSERT(err == 0);
998 
999 		bitset_del(&pg->cmt_cpus_actv_set, cp->cpu_seqid);
1000 
1001 		/*
1002 		 * If there are no more active CPUs in this PG over which
1003 		 * load was balanced, remove it as a balancing candidate.
1004 		 */
1005 		if (GROUP_SIZE(&pg->cmt_cpus_actv) == 0 &&
1006 		    (pg->cmt_policy & (CMT_BALANCE | CMT_COALESCE))) {
1007 			err = group_remove(pg->cmt_siblings, pg, GRP_NORESIZE);
1008 			ASSERT(err == 0);
1009 
1010 			if (pg->cmt_parent == NULL &&
1011 			    pg->cmt_siblings != &cmt_root->cl_pgs) {
1012 				err = group_remove(&cmt_root->cl_pgs, pg,
1013 				    GRP_NORESIZE);
1014 				ASSERT(err == 0);
1015 			}
1016 		}
1017 
1018 		/*
1019 		 * Assert the number of active CPUs does not exceed
1020 		 * the total number of CPUs in the PG
1021 		 */
1022 		ASSERT(GROUP_SIZE(&pg->cmt_cpus_actv) <=
1023 		    GROUP_SIZE(&((pg_t *)pg)->pg_cpus));
1024 
1025 		/*
1026 		 * Update the PG bitset in the CPU's old partition
1027 		 */
1028 		found = B_FALSE;
1029 		PG_CPU_ITR_INIT(pg, cpu_itr);
1030 		while ((cpp = pg_cpu_next(&cpu_itr)) != NULL) {
1031 			if (cpp == cp)
1032 				continue;
1033 			if (CPU_ACTIVE(cpp) &&
1034 			    cpp->cpu_part->cp_id == cp->cpu_part->cp_id) {
1035 				found = B_TRUE;
1036 				break;
1037 			}
1038 		}
1039 		if (!found) {
1040 			bitset_del(&cp->cpu_part->cp_cmt_pgs,
1041 			    ((pg_t *)pg)->pg_id);
1042 		}
1043 	}
1044 }
1045 
1046 /*
1047  * Return non-zero if the CPU belongs in the given PG
1048  */
1049 static int
1050 pg_cmt_cpu_belongs(pg_t *pg, cpu_t *cp)
1051 {
1052 	cpu_t	*pg_cpu;
1053 
1054 	pg_cpu = GROUP_ACCESS(&pg->pg_cpus, 0);
1055 
1056 	ASSERT(pg_cpu != NULL);
1057 
1058 	/*
1059 	 * The CPU belongs if, given the nature of the hardware sharing
1060 	 * relationship represented by the PG, the CPU has that
1061 	 * relationship with some other CPU already in the PG
1062 	 */
1063 	if (pg_plat_cpus_share(cp, pg_cpu, ((pghw_t *)pg)->pghw_hw))
1064 		return (1);
1065 
1066 	return (0);
1067 }
1068 
1069 /*
1070  * Sort the CPUs CMT hierarchy, where "size" is the number of levels.
1071  */
1072 static void
1073 pg_cmt_hier_sort(pg_cmt_t **hier, int size)
1074 {
1075 	int		i, j, inc;
1076 	pg_t		*tmp;
1077 	pg_t		**h = (pg_t **)hier;
1078 
1079 	/*
1080 	 * First sort by number of CPUs
1081 	 */
1082 	inc = size / 2;
1083 	while (inc > 0) {
1084 		for (i = inc; i < size; i++) {
1085 			j = i;
1086 			tmp = h[i];
1087 			while ((j >= inc) &&
1088 			    (PG_NUM_CPUS(h[j - inc]) > PG_NUM_CPUS(tmp))) {
1089 				h[j] = h[j - inc];
1090 				j = j - inc;
1091 			}
1092 			h[j] = tmp;
1093 		}
1094 		if (inc == 2)
1095 			inc = 1;
1096 		else
1097 			inc = (inc * 5) / 11;
1098 	}
1099 
1100 	/*
1101 	 * Break ties by asking the platform.
1102 	 * Determine if h[i] outranks h[i + 1] and if so, swap them.
1103 	 */
1104 	for (i = 0; i < size - 1; i++) {
1105 		if ((PG_NUM_CPUS(h[i]) == PG_NUM_CPUS(h[i + 1])) &&
1106 		    pg_cmt_hier_rank(hier[i], hier[i + 1]) == hier[i]) {
1107 			tmp = h[i];
1108 			h[i] = h[i + 1];
1109 			h[i + 1] = tmp;
1110 		}
1111 	}
1112 }
1113 
1114 /*
1115  * Return a cmt_lgrp_t * given an lgroup handle.
1116  */
1117 static cmt_lgrp_t *
1118 pg_cmt_find_lgrp(lgrp_handle_t hand)
1119 {
1120 	cmt_lgrp_t	*lgrp;
1121 
1122 	ASSERT(MUTEX_HELD(&cpu_lock));
1123 
1124 	lgrp = cmt_lgrps;
1125 	while (lgrp != NULL) {
1126 		if (lgrp->cl_hand == hand)
1127 			break;
1128 		lgrp = lgrp->cl_next;
1129 	}
1130 	return (lgrp);
1131 }
1132 
1133 /*
1134  * Create a cmt_lgrp_t with the specified handle.
1135  */
1136 static cmt_lgrp_t *
1137 pg_cmt_lgrp_create(lgrp_handle_t hand)
1138 {
1139 	cmt_lgrp_t	*lgrp;
1140 
1141 	ASSERT(MUTEX_HELD(&cpu_lock));
1142 
1143 	lgrp = kmem_zalloc(sizeof (cmt_lgrp_t), KM_SLEEP);
1144 
1145 	lgrp->cl_hand = hand;
1146 	lgrp->cl_npgs = 0;
1147 	lgrp->cl_next = cmt_lgrps;
1148 	cmt_lgrps = lgrp;
1149 	group_create(&lgrp->cl_pgs);
1150 
1151 	return (lgrp);
1152 }
1153 
1154 /*
1155  * Interfaces to enable and disable power aware dispatching
1156  * The caller must be holding cpu_lock.
1157  *
1158  * Return 0 on success and -1 on failure.
1159  */
1160 int
1161 cmt_pad_enable(pghw_type_t type)
1162 {
1163 	group_t		*hwset;
1164 	group_iter_t	iter;
1165 	pg_cmt_t	*pg;
1166 
1167 	ASSERT(PGHW_IS_PM_DOMAIN(type));
1168 	ASSERT(MUTEX_HELD(&cpu_lock));
1169 
1170 	if ((hwset = pghw_set_lookup(type)) == NULL ||
1171 	    cmt_hw_blacklisted[type]) {
1172 		/*
1173 		 * Unable to find any instances of the specified type
1174 		 * of power domain, or the power domains have been blacklisted.
1175 		 */
1176 		return (-1);
1177 	}
1178 
1179 	/*
1180 	 * Iterate over the power domains, setting the default dispatcher
1181 	 * policy for power/performance optimization.
1182 	 *
1183 	 * Simply setting the policy isn't enough in the case where the power
1184 	 * domain is an only child of another PG. Because the dispatcher walks
1185 	 * the PG hierarchy in a top down fashion, the higher up PG's policy
1186 	 * will dominate. So promote the power domain above it's parent if both
1187 	 * PG and it's parent have the same CPUs to ensure it's policy
1188 	 * dominates.
1189 	 */
1190 	group_iter_init(&iter);
1191 	while ((pg = group_iterate(hwset, &iter)) != NULL) {
1192 		/*
1193 		 * If the power domain is an only child to a parent
1194 		 * not implementing the same policy, promote the child
1195 		 * above the parent to activate the policy.
1196 		 */
1197 		pg->cmt_policy = pg_cmt_policy(((pghw_t *)pg)->pghw_hw);
1198 		while ((pg->cmt_parent != NULL) &&
1199 		    (pg->cmt_parent->cmt_policy != pg->cmt_policy) &&
1200 		    (PG_NUM_CPUS((pg_t *)pg) ==
1201 		    PG_NUM_CPUS((pg_t *)pg->cmt_parent))) {
1202 			cmt_hier_promote(pg);
1203 		}
1204 	}
1205 
1206 	return (0);
1207 }
1208 
1209 int
1210 cmt_pad_disable(pghw_type_t type)
1211 {
1212 	group_t		*hwset;
1213 	group_iter_t	iter;
1214 	pg_cmt_t	*pg;
1215 	pg_cmt_t	*child;
1216 
1217 	ASSERT(PGHW_IS_PM_DOMAIN(type));
1218 	ASSERT(MUTEX_HELD(&cpu_lock));
1219 
1220 	if ((hwset = pghw_set_lookup(type)) == NULL) {
1221 		/*
1222 		 * Unable to find any instances of the specified type of
1223 		 * power domain.
1224 		 */
1225 		return (-1);
1226 	}
1227 	/*
1228 	 * Iterate over the power domains, setting the default dispatcher
1229 	 * policy for performance optimization (load balancing).
1230 	 */
1231 	group_iter_init(&iter);
1232 	while ((pg = group_iterate(hwset, &iter)) != NULL) {
1233 
1234 		/*
1235 		 * If the power domain has an only child that implements
1236 		 * policy other than load balancing, promote the child
1237 		 * above the power domain to ensure it's policy dominates.
1238 		 */
1239 		if (pg->cmt_children != NULL &&
1240 		    GROUP_SIZE(pg->cmt_children) == 1) {
1241 			child = GROUP_ACCESS(pg->cmt_children, 0);
1242 			if ((child->cmt_policy & CMT_BALANCE) == 0) {
1243 				cmt_hier_promote(child);
1244 			}
1245 		}
1246 		pg->cmt_policy = CMT_BALANCE;
1247 	}
1248 	return (0);
1249 }
1250 
1251 /* ARGSUSED */
1252 static void
1253 cmt_ev_thread_swtch(pg_t *pg, cpu_t *cp, hrtime_t now, kthread_t *old,
1254 		    kthread_t *new)
1255 {
1256 	pg_cmt_t	*cmt_pg = (pg_cmt_t *)pg;
1257 
1258 	if (old == cp->cpu_idle_thread) {
1259 		atomic_add_32(&cmt_pg->cmt_utilization, 1);
1260 	} else if (new == cp->cpu_idle_thread) {
1261 		atomic_add_32(&cmt_pg->cmt_utilization, -1);
1262 	}
1263 }
1264 
/*
 * Test whether thread "t" is currently runnable on a CPU in CMT PG "pg".
 *
 * True when the thread is in the TS_RUN state, its dispatch queue has a CPU
 * associated with it, and that CPU's sequence id is a member of the PG's
 * active CPU set. Note that "t" is expanded more than once.
 */
#define	THREAD_RUNNABLE_IN_PG(t, pg)					\
	(((t)->t_state == TS_RUN) &&					\
	((t)->t_disp_queue->disp_cpu != NULL) &&			\
	bitset_in_set(&(pg)->cmt_cpus_actv_set,				\
	(t)->t_disp_queue->disp_cpu->cpu_seqid))
1273 
1274 static void
1275 cmt_ev_thread_swtch_pwr(pg_t *pg, cpu_t *cp, hrtime_t now, kthread_t *old,
1276     kthread_t *new)
1277 {
1278 	pg_cmt_t	*cmt = (pg_cmt_t *)pg;
1279 	cpupm_domain_t	*dom;
1280 	uint32_t	u;
1281 
1282 	if (old == cp->cpu_idle_thread) {
1283 		ASSERT(new != cp->cpu_idle_thread);
1284 		u = atomic_add_32_nv(&cmt->cmt_utilization, 1);
1285 		if (u == 1) {
1286 			/*
1287 			 * Notify the CPU power manager that the domain
1288 			 * is non-idle.
1289 			 */
1290 			dom = (cpupm_domain_t *)cmt->cmt_pg.pghw_handle;
1291 			cpupm_utilization_event(cp, now, dom,
1292 			    CPUPM_DOM_BUSY_FROM_IDLE);
1293 		}
1294 	} else if (new == cp->cpu_idle_thread) {
1295 		ASSERT(old != cp->cpu_idle_thread);
1296 		u = atomic_add_32_nv(&cmt->cmt_utilization, -1);
1297 		if (u == 0) {
1298 			/*
1299 			 * The domain is idle, notify the CPU power
1300 			 * manager.
1301 			 *
1302 			 * Avoid notifying if the thread is simply migrating
1303 			 * between CPUs in the domain.
1304 			 */
1305 			if (!THREAD_RUNNABLE_IN_PG(old, cmt)) {
1306 				dom = (cpupm_domain_t *)cmt->cmt_pg.pghw_handle;
1307 				cpupm_utilization_event(cp, now, dom,
1308 				    CPUPM_DOM_IDLE_FROM_BUSY);
1309 			}
1310 		}
1311 	}
1312 }
1313 
1314 /* ARGSUSED */
1315 static void
1316 cmt_ev_thread_remain_pwr(pg_t *pg, cpu_t *cp, kthread_t *t)
1317 {
1318 	pg_cmt_t	*cmt = (pg_cmt_t *)pg;
1319 	cpupm_domain_t	*dom;
1320 
1321 	dom = (cpupm_domain_t *)cmt->cmt_pg.pghw_handle;
1322 	cpupm_utilization_event(cp, (hrtime_t)0, dom, CPUPM_DOM_REMAIN_BUSY);
1323 }
1324 
1325 /*
1326  * Return the name of the CMT scheduling policy
1327  * being implemented across this PG
1328  */
1329 static char *
1330 pg_cmt_policy_name(pg_t *pg)
1331 {
1332 	pg_cmt_policy_t policy;
1333 
1334 	policy = ((pg_cmt_t *)pg)->cmt_policy;
1335 
1336 	if (policy & CMT_AFFINITY) {
1337 		if (policy & CMT_BALANCE)
1338 			return ("Load Balancing & Affinity");
1339 		else if (policy & CMT_COALESCE)
1340 			return ("Load Coalescence & Affinity");
1341 		else
1342 			return ("Affinity");
1343 	} else {
1344 		if (policy & CMT_BALANCE)
1345 			return ("Load Balancing");
1346 		else if (policy & CMT_COALESCE)
1347 			return ("Load Coalescence");
1348 		else
1349 			return ("None");
1350 	}
1351 }
1352 
1353 /*
1354  * Prune PG, and all other instances of PG's hardware sharing relationship
1355  * from the PG hierarchy.
1356  */
1357 static int
1358 pg_cmt_prune(pg_cmt_t *pg_bad, pg_cmt_t **lineage, int *sz)
1359 {
1360 	group_t		*hwset, *children;
1361 	int		i, j, r, size = *sz;
1362 	group_iter_t	hw_iter, child_iter;
1363 	pg_cpu_itr_t	cpu_iter;
1364 	pg_cmt_t	*pg, *child;
1365 	cpu_t		*cpu;
1366 	int		cap_needed;
1367 	pghw_type_t	hw;
1368 
1369 	ASSERT(MUTEX_HELD(&cpu_lock));
1370 
1371 	hw = ((pghw_t *)pg_bad)->pghw_hw;
1372 
1373 	if (hw == PGHW_POW_ACTIVE) {
1374 		cmn_err(CE_NOTE, "!Active CPUPM domain groups look suspect. "
1375 		    "Event Based CPUPM Unavailable");
1376 	} else if (hw == PGHW_POW_IDLE) {
1377 		cmn_err(CE_NOTE, "!Idle CPUPM domain groups look suspect. "
1378 		    "Dispatcher assisted CPUPM disabled.");
1379 	}
1380 
1381 	/*
1382 	 * Find and eliminate the PG from the lineage.
1383 	 */
1384 	for (i = 0; i < size; i++) {
1385 		if (lineage[i] == pg_bad) {
1386 			for (j = i; j < size - 1; j++)
1387 				lineage[j] = lineage[j + 1];
1388 			*sz = size - 1;
1389 			break;
1390 		}
1391 	}
1392 
1393 	/*
1394 	 * We'll prune all instances of the hardware sharing relationship
1395 	 * represented by pg. But before we do that (and pause CPUs) we need
1396 	 * to ensure the hierarchy's groups are properly sized.
1397 	 */
1398 	hwset = pghw_set_lookup(hw);
1399 
1400 	/*
1401 	 * Blacklist the hardware so that future groups won't be created.
1402 	 */
1403 	cmt_hw_blacklisted[hw] = 1;
1404 
1405 	/*
1406 	 * For each of the PGs being pruned, ensure sufficient capacity in
1407 	 * the siblings set for the PG's children
1408 	 */
1409 	group_iter_init(&hw_iter);
1410 	while ((pg = group_iterate(hwset, &hw_iter)) != NULL) {
1411 		/*
1412 		 * PG is being pruned, but if it is bringing up more than
1413 		 * one child, ask for more capacity in the siblings group.
1414 		 */
1415 		cap_needed = 0;
1416 		if (pg->cmt_children &&
1417 		    GROUP_SIZE(pg->cmt_children) > 1) {
1418 			cap_needed = GROUP_SIZE(pg->cmt_children) - 1;
1419 
1420 			group_expand(pg->cmt_siblings,
1421 			    GROUP_SIZE(pg->cmt_siblings) + cap_needed);
1422 
1423 			/*
1424 			 * If this is a top level group, also ensure the
1425 			 * capacity in the root lgrp level CMT grouping.
1426 			 */
1427 			if (pg->cmt_parent == NULL &&
1428 			    pg->cmt_siblings != &cmt_root->cl_pgs) {
1429 				group_expand(&cmt_root->cl_pgs,
1430 				    GROUP_SIZE(&cmt_root->cl_pgs) + cap_needed);
1431 			}
1432 		}
1433 	}
1434 
1435 	/*
1436 	 * We're operating on the PG hierarchy. Pause CPUs to ensure
1437 	 * exclusivity with respect to the dispatcher.
1438 	 */
1439 	pause_cpus(NULL);
1440 
1441 	/*
1442 	 * Prune all PG instances of the hardware sharing relationship
1443 	 * represented by pg.
1444 	 */
1445 	group_iter_init(&hw_iter);
1446 	while ((pg = group_iterate(hwset, &hw_iter)) != NULL) {
1447 
1448 		/*
1449 		 * Remove PG from it's group of siblings, if it's there.
1450 		 */
1451 		if (pg->cmt_siblings) {
1452 			(void) group_remove(pg->cmt_siblings, pg, GRP_NORESIZE);
1453 		}
1454 		if (pg->cmt_parent == NULL &&
1455 		    pg->cmt_siblings != &cmt_root->cl_pgs) {
1456 			(void) group_remove(&cmt_root->cl_pgs, pg,
1457 			    GRP_NORESIZE);
1458 		}
1459 		/*
1460 		 * Move PG's children from it's children set to it's parent's
1461 		 * children set. Note that the parent's children set, and PG's
1462 		 * siblings set are the same thing.
1463 		 *
1464 		 * Because we are iterating over the same group that we are
1465 		 * operating on (removing the children), first add all of PG's
1466 		 * children to the parent's children set, and once we are done
1467 		 * iterating, empty PG's children set.
1468 		 */
1469 		if (pg->cmt_children != NULL) {
1470 			children = pg->cmt_children;
1471 
1472 			group_iter_init(&child_iter);
1473 			while ((child = group_iterate(children, &child_iter))
1474 			    != NULL) {
1475 				if (pg->cmt_siblings != NULL) {
1476 					r = group_add(pg->cmt_siblings, child,
1477 					    GRP_NORESIZE);
1478 					ASSERT(r == 0);
1479 				}
1480 			}
1481 			group_empty(pg->cmt_children);
1482 		}
1483 
1484 		/*
1485 		 * Reset the callbacks to the defaults
1486 		 */
1487 		pg_callback_set_defaults((pg_t *)pg);
1488 
1489 		/*
1490 		 * Update all the CPU lineages in each of PG's CPUs
1491 		 */
1492 		PG_CPU_ITR_INIT(pg, cpu_iter);
1493 		while ((cpu = pg_cpu_next(&cpu_iter)) != NULL) {
1494 			group_t		*pgs;
1495 			pg_cmt_t	*cpu_pg;
1496 			group_iter_t	liter;	/* Iterator for the lineage */
1497 
1498 			/*
1499 			 * Iterate over the CPU's PGs updating the children
1500 			 * of the PG being promoted, since they have a new
1501 			 * parent and siblings set.
1502 			 */
1503 			pgs = &cpu->cpu_pg->pgs;
1504 			group_iter_init(&liter);
1505 			while ((cpu_pg = group_iterate(pgs, &liter)) != NULL) {
1506 				if (cpu_pg->cmt_parent == pg) {
1507 					cpu_pg->cmt_parent = pg->cmt_parent;
1508 					cpu_pg->cmt_siblings = pg->cmt_siblings;
1509 				}
1510 			}
1511 
1512 			/*
1513 			 * Update the CPU's lineages
1514 			 */
1515 			pgs = &cpu->cpu_pg->cmt_pgs;
1516 			(void) group_remove(pgs, pg, GRP_NORESIZE);
1517 			pgs = &cpu->cpu_pg->pgs;
1518 			(void) group_remove(pgs, pg, GRP_NORESIZE);
1519 		}
1520 	}
1521 	start_cpus();
1522 	return (0);
1523 }
1524 
1525 /*
1526  * Disable CMT scheduling
1527  */
1528 static void
1529 pg_cmt_disable(void)
1530 {
1531 	cpu_t	*cpu;
1532 
1533 	pause_cpus(NULL);
1534 	cpu = cpu_list;
1535 
1536 	do {
1537 		if (cpu->cpu_pg)
1538 			group_empty(&cpu->cpu_pg->cmt_pgs);
1539 	} while ((cpu = cpu->cpu_next) != cpu_list);
1540 
1541 	cmt_sched_disabled = 1;
1542 	start_cpus();
1543 	cmn_err(CE_NOTE, "!CMT thread placement optimizations unavailable");
1544 }
1545 
/*
 * CMT lineage validation
 *
 * This routine is invoked by pg_cmt_cpu_init() to validate the integrity
 * of the PGs in a CPU's lineage. This is necessary because it's possible that
 * some groupings (power domain groupings in particular) may be defined by
 * sources that are buggy (e.g. BIOS bugs). In such cases, it may not be
 * possible to integrate those groupings into the CMT PG hierarchy, if doing
 * so would violate the subset invariant of the hierarchy, which says that
 * a PG must be subset of its parent (if it has one).
 *
 * pg_cmt_lineage_validate()'s purpose is to detect grouping definitions that
 * would result in a violation of this invariant. If a violation is found,
 * and the PG is of a grouping type whose definition is known to originate from
 * suspect sources (BIOS), then pg_cmt_prune() will be invoked to prune the
 * PG (and all other instances of PG's sharing relationship type) from the
 * hierarchy. Further, future instances of that sharing relationship type won't
 * be instantiated. If the grouping definition doesn't originate from suspect
 * sources, then pg_cmt_disable() will be invoked to log an error, and disable
 * CMT scheduling altogether.
 *
 * This routine is invoked after the CPU has been added to the PGs in which
 * it belongs, but before those PGs have been added to (or had their place
 * adjusted in) the CMT PG hierarchy.
 *
 * The first argument is the CPUs PG lineage (essentially an array of PGs in
 * which the CPU belongs) that has already been sorted in ascending order
 * by CPU count. Some of the PGs in the CPUs lineage may already have other
 * CPUs in them, and have already been integrated into the CMT hierarchy.
 *
 * The second argument points to the number of PGs in the lineage. It is
 * both read and updated: whenever pg_cmt_prune() removes a PG from the
 * lineage it decrements the count, and validation then starts over using
 * the new count.
 *
 * The addition of this new CPU to these pre-existing PGs means that those
 * PGs may need to be promoted up in the hierarchy to satisfy the subset
 * invariant. In addition to testing the subset invariant for the lineage,
 * this routine also verifies that the addition of the new CPU to the
 * existing PGs wouldn't cause the subset invariant to be violated in
 * the existing lineages.
 *
 * This routine will normally return one of the following:
 * CMT_LINEAGE_VALID - There were no problems detected with the lineage.
 * CMT_LINEAGE_REPAIRED - Problems were detected, but repaired via pruning.
 *
 * Otherwise, this routine will return a value indicating which error it
 * was unable to recover from (and set cmt_lineage_status along the way).
 *
 * Note that cmt_lineage_status is not reset on entry. If no problem is
 * detected, the value it already holds is what gets examined and returned.
 *
 * The caller must hold cpu_lock.
 */
static cmt_lineage_validation_t
pg_cmt_lineage_validate(pg_cmt_t **lineage, int *sz)
{
	int		i, j, size;
	pg_cmt_t	*pg, *pg_next, *pg_bad, *pg_tmp;
	cpu_t		*cp;
	pg_cpu_itr_t	cpu_iter;
	lgrp_handle_t	lgrp;

	ASSERT(MUTEX_HELD(&cpu_lock));

	/*
	 * Validation (re)starts here. We come back each time a PG has been
	 * pruned, since the lineage (and its size) will have changed.
	 */
revalidate:
	size = *sz;
	pg_bad = NULL;
	lgrp = LGRP_NULL_HANDLE;
	for (i = 0; i < size; i++) {

		pg = lineage[i];
		if (i < size - 1)
			pg_next = lineage[i + 1];
		else
			pg_next = NULL;

		/*
		 * We assume that the lineage has already been sorted
		 * by the number of CPUs. In fact, we depend on it.
		 */
		ASSERT(pg_next == NULL ||
		    (PG_NUM_CPUS((pg_t *)pg) <= PG_NUM_CPUS((pg_t *)pg_next)));

		/*
		 * Check to make sure that the existing parent of PG (if any)
		 * is either in the PG's lineage, or the PG has more CPUs than
		 * its existing parent and can and should be promoted above its
		 * parent.
		 *
		 * Since the PG topology is in the middle of being changed, we
		 * need to check whether the PG's existing parent (if any) is
		 * part of its lineage (and therefore should contain the new
		 * CPU). If not, it means that the addition of the new CPU
		 * should have made this PG have more CPUs than its parent, and
		 * this PG should be promoted to be above its existing parent
		 * now. We need to verify all of this to defend against a buggy
		 * BIOS giving bad power domain CPU groupings. Sigh.
		 */
		if (pg->cmt_parent) {
			/*
			 * Determine if cmt_parent is in this lineage
			 */
			for (j = 0; j < size; j++) {
				pg_tmp = lineage[j];
				if (pg_tmp == pg->cmt_parent)
					break;
			}
			if (pg_tmp != pg->cmt_parent) {
				/*
				 * cmt_parent is not in the lineage, verify
				 * it is a proper subset of PG.
				 */
				if (PG_NUM_CPUS((pg_t *)pg->cmt_parent) >=
				    PG_NUM_CPUS((pg_t *)pg)) {
					/*
					 * Not a proper subset if cmt_parent
					 * has as many CPUs as pg, or more...
					 */
					cmt_lineage_status =
					    CMT_LINEAGE_NON_PROMOTABLE;
					goto handle_error;
				}
			}
		}

		/*
		 * Walk each of the CPUs in the PGs group and perform
		 * consistency checks along the way.
		 */
		PG_CPU_ITR_INIT((pg_t *)pg, cpu_iter);
		while ((cp = pg_cpu_next(&cpu_iter)) != NULL) {
			/*
			 * Verify that there aren't any CPUs contained in PG
			 * that the next PG in the lineage (which is larger
			 * or same size) doesn't also contain.
			 */
			if (pg_next != NULL &&
			    pg_cpu_find((pg_t *)pg_next, cp) == B_FALSE) {
				cmt_lineage_status = CMT_LINEAGE_NON_CONCENTRIC;
				goto handle_error;
			}

			/*
			 * Verify that all the CPUs in the PG are in the same
			 * lgroup.
			 *
			 * The handle being compared against is only reset
			 * when validation (re)starts, not for each PG, so
			 * this in fact requires every CPU of every PG in
			 * the lineage to map to the same lgroup handle.
			 */
			if (lgrp == LGRP_NULL_HANDLE) {
				lgrp = lgrp_plat_cpu_to_hand(cp->cpu_id);
			} else if (lgrp_plat_cpu_to_hand(cp->cpu_id) != lgrp) {
				cmt_lineage_status = CMT_LINEAGE_PG_SPANS_LGRPS;
				goto handle_error;
			}
		}
	}

	/*
	 * Note that this label is also reached by falling out of the loop
	 * above when no error was detected in this pass.
	 */
handle_error:
	/*
	 * Some of these validation errors can result when the CPU grouping
	 * information is derived from buggy sources (for example, incorrect
	 * ACPI tables on x86 systems).
	 *
	 * We'll try to recover in such cases by pruning out the illegal
	 * groupings from the PG hierarchy, which means that we won't optimize
	 * for those levels, but we will for the remaining ones.
	 */
	switch (cmt_lineage_status) {
	case CMT_LINEAGE_VALID:
	case CMT_LINEAGE_REPAIRED:
		break;
	case CMT_LINEAGE_PG_SPANS_LGRPS:
		/*
		 * We've detected a PG whose CPUs span lgroups.
		 *
		 * This isn't supported, as the dispatcher isn't allowed
		 * to do CMT thread placement across lgroups, as this would
		 * conflict with policies implementing MPO thread affinity.
		 *
		 * The handling for this falls through to the next case.
		 */
	case CMT_LINEAGE_NON_PROMOTABLE:
		/*
		 * We've detected a PG that already exists in another CPU's
		 * lineage that cannot legally be promoted into place
		 * without breaking the invariants of the hierarchy.
		 */
		if (PG_CMT_HW_SUSPECT(((pghw_t *)pg)->pghw_hw)) {
			if (pg_cmt_prune(pg, lineage, sz) == 0) {
				cmt_lineage_status = CMT_LINEAGE_REPAIRED;
				goto revalidate;
			}
		}
		/*
		 * Either the PG's grouping isn't from a suspect source, or
		 * something went wrong trying to prune out the bad level.
		 * Disable CMT scheduling altogether.
		 */
		pg_cmt_disable();
		break;
	case CMT_LINEAGE_NON_CONCENTRIC:
		/*
		 * We've detected a non-concentric PG lineage, which means that
		 * there's a PG in the lineage that has CPUs that the next PG
		 * over in the lineage (which is the same size or larger)
		 * doesn't have.
		 *
		 * In this case, we examine the two PGs to see if either
		 * grouping is defined by potentially buggy sources.
		 *
		 * If one has less CPUs than the other, and contains CPUs
		 * not found in the parent, and it is an untrusted enumeration,
		 * then prune it. If both have the same number of CPUs, then
		 * prune the one that is untrusted.
		 *
		 * This process repeats until we have a concentric lineage,
		 * or we would have to prune out a level derived from what we
		 * thought was a reliable source, in which case CMT scheduling
		 * is disabled altogether.
		 */
		if ((PG_NUM_CPUS((pg_t *)pg) < PG_NUM_CPUS((pg_t *)pg_next)) &&
		    (PG_CMT_HW_SUSPECT(((pghw_t *)pg)->pghw_hw))) {
			pg_bad = pg;
		} else if (PG_NUM_CPUS((pg_t *)pg) ==
		    PG_NUM_CPUS((pg_t *)pg_next)) {
			if (PG_CMT_HW_SUSPECT(((pghw_t *)pg_next)->pghw_hw)) {
				pg_bad = pg_next;
			} else if (PG_CMT_HW_SUSPECT(((pghw_t *)pg)->pghw_hw)) {
				pg_bad = pg;
			}
		}
		if (pg_bad) {
			if (pg_cmt_prune(pg_bad, lineage, sz) == 0) {
				cmt_lineage_status = CMT_LINEAGE_REPAIRED;
				goto revalidate;
			}
		}
		/*
		 * Something went wrong trying to identify and/or prune out
		 * the bad level. Disable CMT scheduling altogether.
		 */
		pg_cmt_disable();
		break;
	default:
		/*
		 * If we're here, we've encountered a validation error for
		 * which we don't know how to recover. In this case, disable
		 * CMT scheduling altogether.
		 */
		cmt_lineage_status = CMT_LINEAGE_UNRECOVERABLE;
		pg_cmt_disable();
	}
	return (cmt_lineage_status);
}
1788