xref: /titanic_50/usr/src/uts/common/disp/cmt.c (revision 81ea8c75ed90773d7d2d2dcf7919ec44e9fd4119)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/systm.h>
29 #include <sys/types.h>
30 #include <sys/param.h>
31 #include <sys/thread.h>
32 #include <sys/cpuvar.h>
33 #include <sys/cpupart.h>
34 #include <sys/kmem.h>
35 #include <sys/cmn_err.h>
36 #include <sys/kstat.h>
37 #include <sys/processor.h>
38 #include <sys/disp.h>
39 #include <sys/group.h>
40 #include <sys/pghw.h>
41 #include <sys/bitset.h>
42 #include <sys/lgrp.h>
43 #include <sys/cmt.h>
44 
45 /*
46  * CMT scheduler / dispatcher support
47  *
48  * This file implements CMT scheduler support using Processor Groups.
49  * The CMT processor group class creates and maintains the CMT class
50  * specific processor group pg_cmt_t.
51  *
52  * ---------------------------- <-- pg_cmt_t *
53  * | pghw_t                   |
54  * ----------------------------
55  * | CMT class specific data  |
56  * | - hierarchy linkage      |
57  * | - CMT load balancing data|
58  * | - active CPU group/bitset|
59  * ----------------------------
60  *
61  * The scheduler/dispatcher leverages knowledge of the performance
62  * relevant CMT sharing relationships existing between cpus to implement
63  * optimized affinity and load balancing policies.
64  *
65  * Load balancing policy seeks to improve performance by minimizing
66  * contention over shared processor resources / facilities, while the
67  * affinity policies seek to improve cache and TLB utilization.
68  *
69  * The CMT PGs created by this class are already arranged into a
70  * hierarchy (which is done in the pghw layer). To implement the top-down
71  * CMT load balancing algorithm, the CMT PGs additionally maintain
72  * parent, child and sibling hierarchy relationships.
73  * Parent PGs always contain a superset of their children(s) resources,
74  * each PG can have at most one parent, and siblings are the group of PGs
75  * sharing the same parent.
76  *
 * On NUMA systems, the CMT load balancing algorithm balances across the
 * CMT PGs within their respective lgroups. On UMA based systems, there
 * exists a top level group of PGs to balance across. On NUMA systems multiple
 * top level groups are instantiated, where the top level balancing begins by
 * balancing across the CMT PGs within their respective (per lgroup) top level
 * groups.
83  */
84 
/*
 * Per-lgroup CMT bookkeeping.
 *
 * One cmt_lgrp_t is created for each distinct lgroup platform handle
 * (see pg_cmt_lgrp_create()); all of them are chained through cl_next
 * on the list headed by cmt_lgrps.
 */
typedef struct cmt_lgrp {
	group_t		cl_pgs;		/* Top level group of active CMT PGs */
	int		cl_npgs;	/* # of top level PGs in the lgroup */
	lgrp_handle_t	cl_hand;	/* lgroup's platform handle */
	struct cmt_lgrp *cl_next;	/* next cmt_lgrp */
} cmt_lgrp_t;
91 
/*
 * File-scope state. The list and cpu0_lgrp are read and written by the
 * CPU init/fini callbacks in this file.
 */
static cmt_lgrp_t	*cmt_lgrps = NULL;	/* cmt_lgrps list head */
static cmt_lgrp_t	*cpu0_lgrp = NULL;	/* boot CPU's initial lgrp */
						/* used for null_proc_lpa */

/*
 * Starts out as 1 and is cleared by the first call to pg_cmt_cpu_init(),
 * which also records that CPU's lgrp in cpu0_lgrp.
 */
static int		is_cpu0 = 1; /* true if this is boot CPU context */

/*
 * Set this to non-zero to disable CMT scheduling.
 * This must be done via kmdb -d, as /etc/system will be too late.
 * When set, pg_cmt_class_init() returns without registering the class.
 */
static int		cmt_sched_disabled = 0;

/* Assigned from pg_class_register() in pg_cmt_class_init() */
static pg_cid_t		pg_cmt_class_id;		/* PG class id */
105 
106 static pg_t		*pg_cmt_alloc();
107 static void		pg_cmt_free(pg_t *);
108 static void		pg_cmt_cpu_init(cpu_t *);
109 static void		pg_cmt_cpu_fini(cpu_t *);
110 static void		pg_cmt_cpu_active(cpu_t *);
111 static void		pg_cmt_cpu_inactive(cpu_t *);
112 static void		pg_cmt_cpupart_in(cpu_t *, cpupart_t *);
113 static void		pg_cmt_cpupart_move(cpu_t *, cpupart_t *, cpupart_t *);
114 static void		pg_cmt_hier_pack(pg_cmt_t **, int);
115 static int		pg_cmt_cpu_belongs(pg_t *, cpu_t *);
116 static int		pg_cmt_hw(pghw_type_t);
117 static cmt_lgrp_t	*pg_cmt_find_lgrp(lgrp_handle_t);
118 static cmt_lgrp_t	*pg_cmt_lgrp_create(lgrp_handle_t);
119 
/*
 * Macro to test if PG is managed by the CMT PG class.
 *
 * Evaluates to non-zero when the class id of the PG's class matches
 * pg_cmt_class_id. The argument is cast to (pg_t *), so either a pg_t *
 * or a pg_cmt_t * may be passed; it must not be NULL.
 */
#define	IS_CMT_PG(pg)	(((pg_t *)(pg))->pg_class->pgc_id == pg_cmt_class_id)
124 
/*
 * CMT PG ops
 *
 * Callback vector handed to pg_class_register() by pg_cmt_class_init().
 * The initializer is positional, so the entries must stay in the order
 * in which struct pg_ops (declared elsewhere) lays out its members.
 * The CMT class supplies no cpupart_out handler, hence the NULL slot.
 */
struct pg_ops pg_ops_cmt = {
	pg_cmt_alloc,
	pg_cmt_free,
	pg_cmt_cpu_init,
	pg_cmt_cpu_fini,
	pg_cmt_cpu_active,
	pg_cmt_cpu_inactive,
	pg_cmt_cpupart_in,
	NULL,			/* cpupart_out */
	pg_cmt_cpupart_move,
	pg_cmt_cpu_belongs,
};
140 
141 /*
142  * Initialize the CMT PG class
143  */
144 void
145 pg_cmt_class_init(void)
146 {
147 	if (cmt_sched_disabled)
148 		return;
149 
150 	pg_cmt_class_id = pg_class_register("cmt", &pg_ops_cmt, PGR_PHYSICAL);
151 }
152 
/*
 * Called to indicate a new CPU has started up so
 * that either t0 or the slave startup thread can
 * be accounted for.
 *
 * cp is the CPU that has just started. The accounting is done by
 * applying PG_NRUN_UPDATE to cp with a delta of 1. Besides external
 * callers, pg_cmt_cpu_init() invokes this once for the boot CPU.
 */
void
pg_cmt_cpu_startup(cpu_t *cp)
{
	PG_NRUN_UPDATE(cp, 1);
}
163 
164 /*
165  * Adjust the CMT load in the CMT PGs in which the CPU belongs
166  * Note that "n" can be positive in the case of increasing
167  * load, or negative in the case of decreasing load.
168  */
169 void
170 pg_cmt_load(cpu_t *cp, int n)
171 {
172 	pg_cmt_t	*pg;
173 
174 	pg = (pg_cmt_t *)cp->cpu_pg->cmt_lineage;
175 	while (pg != NULL) {
176 		ASSERT(IS_CMT_PG(pg));
177 		atomic_add_32(&pg->cmt_nrunning, n);
178 		pg = pg->cmt_parent;
179 	}
180 }
181 
182 /*
183  * Return non-zero if thread can migrate between "from" and "to"
184  * without a performance penalty
185  */
186 int
187 pg_cmt_can_migrate(cpu_t *from, cpu_t *to)
188 {
189 	if (from->cpu_physid->cpu_cacheid ==
190 	    to->cpu_physid->cpu_cacheid)
191 		return (1);
192 	return (0);
193 }
194 
195 /*
196  * CMT class specific PG allocation
197  */
198 static pg_t *
199 pg_cmt_alloc(void)
200 {
201 	return (kmem_zalloc(sizeof (pg_cmt_t), KM_NOSLEEP));
202 }
203 
204 /*
205  * Class specific PG de-allocation
206  */
207 static void
208 pg_cmt_free(pg_t *pg)
209 {
210 	ASSERT(pg != NULL);
211 	ASSERT(IS_CMT_PG(pg));
212 
213 	kmem_free((pg_cmt_t *)pg, sizeof (pg_cmt_t));
214 }
215 
216 /*
217  * Return 1 if CMT load balancing policies should be
218  * implemented across instances of the specified hardware
219  * sharing relationship.
220  */
221 static int
222 pg_cmt_load_bal_hw(pghw_type_t hw)
223 {
224 	if (hw == PGHW_IPIPE ||
225 	    hw == PGHW_FPU ||
226 	    hw == PGHW_CHIP)
227 		return (1);
228 	else
229 		return (0);
230 }
231 
232 /*
233  * Return 1 if thread affinity polices should be implemented
234  * for instances of the specifed hardware sharing relationship.
235  */
236 static int
237 pg_cmt_affinity_hw(pghw_type_t hw)
238 {
239 	if (hw == PGHW_CACHE)
240 		return (1);
241 	else
242 		return (0);
243 }
244 
245 /*
246  * Return 1 if CMT scheduling policies should be impelmented
247  * for the specified hardware sharing relationship.
248  */
249 static int
250 pg_cmt_hw(pghw_type_t hw)
251 {
252 	return (pg_cmt_load_bal_hw(hw) ||
253 	    pg_cmt_affinity_hw(hw));
254 }
255 
/*
 * CMT class callback for a new CPU entering the system
 *
 * For every hardware sharing relationship that is relevant to CMT and
 * that the platform reports for cp, the CPU is placed into an existing
 * PG or a newly created one. The PGs that take part in load balancing
 * are then ordered into the CPU's lineage, recorded in the CPU's
 * cmt_pgs group, and linked to their parent and sibling groups.
 * Finally the chip, core and cache ids are cached in cp->cpu_physid.
 *
 * The caller must hold cpu_lock.
 */
static void
pg_cmt_cpu_init(cpu_t *cp)
{
	pg_cmt_t	*pg;
	group_t		*cmt_pgs;
	int		level, max_level, nlevels;
	pghw_type_t	hw;
	pg_t		*pg_cache = NULL;
	pg_cmt_t	*cpu_cmt_hier[PGHW_NUM_COMPONENTS];
	lgrp_handle_t	lgrp_handle;
	cmt_lgrp_t	*lgrp;

	ASSERT(MUTEX_HELD(&cpu_lock));

	/*
	 * A new CPU is coming into the system.
	 * Interrogate the platform to see if the CPU
	 * has any performance relevant CMT sharing
	 * relationships
	 */
	cmt_pgs = &cp->cpu_pg->cmt_pgs;
	cp->cpu_pg->cmt_lineage = NULL;

	/*
	 * cpu_cmt_hier is indexed by pghw_level(); slots for levels that
	 * are not used stay NULL until the array is packed below.
	 */
	bzero(cpu_cmt_hier, sizeof (cpu_cmt_hier));
	max_level = nlevels = 0;
	for (hw = PGHW_START; hw < PGHW_NUM_COMPONENTS; hw++) {

		/*
		 * We're only interested in CMT hw sharing relationships
		 */
		if (pg_cmt_hw(hw) == 0 || pg_plat_hw_shared(cp, hw) == 0)
			continue;

		/*
		 * Find (or create) the PG associated with
		 * the hw sharing relationship in which cp
		 * belongs.
		 *
		 * Determine if a suitable PG already
		 * exists, or if one needs to be created.
		 */
		pg = (pg_cmt_t *)pghw_place_cpu(cp, hw);
		if (pg == NULL) {
			/*
			 * Create a new one.
			 * Initialize the common...
			 */
			pg = (pg_cmt_t *)pg_create(pg_cmt_class_id);

			/* ... physical ... */
			pghw_init((pghw_t *)pg, cp, hw);

			/*
			 * ... and CMT specific portions of the
			 * structure.
			 */
			bitset_init(&pg->cmt_cpus_actv_set);
			group_create(&pg->cmt_cpus_actv);
		} else {
			ASSERT(IS_CMT_PG(pg));
		}

		/* Add the CPU to the PG */
		pg_cpu_add((pg_t *)pg, cp);

		/*
		 * Ensure capacity of the active CPUs group/bitset.
		 * This is done up front because pg_cmt_cpu_active() later
		 * adds to the group with GRP_NORESIZE.
		 */
		group_expand(&pg->cmt_cpus_actv,
		    GROUP_SIZE(&((pg_t *)pg)->pg_cpus));

		if (cp->cpu_seqid >=
		    bitset_capacity(&pg->cmt_cpus_actv_set)) {
			bitset_resize(&pg->cmt_cpus_actv_set,
			    cp->cpu_seqid + 1);
		}

		/*
		 * Build a lineage of CMT PGs for load balancing
		 */
		if (pg_cmt_load_bal_hw(hw)) {
			level = pghw_level(hw);
			cpu_cmt_hier[level] = pg;
			if (level > max_level)
				max_level = level;
			nlevels++;
		}

		/* Cache this for later */
		if (hw == PGHW_CACHE)
			pg_cache = (pg_t *)pg;
	}

	/*
	 * Pack out any gaps in the constructed lineage.
	 * Gaps may exist where the architecture knows
	 * about a hardware sharing relationship, but such a
	 * relationship either isn't relevant for load
	 * balancing or doesn't exist between CPUs on the system.
	 */
	pg_cmt_hier_pack(cpu_cmt_hier, max_level + 1);

	/*
	 * For each of the PGs in the CPU's lineage:
	 *	- Add an entry in the CPU sorted CMT PG group
	 *	  which is used for top down CMT load balancing
	 *	- Tie the PG into the CMT hierarchy by connecting
	 *	  it to its parent and siblings.
	 */
	group_expand(cmt_pgs, nlevels);

	/*
	 * Find the lgrp that encapsulates this CPU's CMT hierarchy
	 */
	lgrp_handle = lgrp_plat_cpu_to_hand(cp->cpu_id);
	lgrp = pg_cmt_find_lgrp(lgrp_handle);
	if (lgrp == NULL)
		lgrp = pg_cmt_lgrp_create(lgrp_handle);

	for (level = 0; level < nlevels; level++) {
		uint_t		children;
		int		err;

		/*
		 * After packing, entry 0 is the lowest level PG and entry
		 * nlevels - 1 the highest. The CPU's cmt_pgs group is
		 * filled in the opposite order, highest level first.
		 */
		pg = cpu_cmt_hier[level];
		err = group_add_at(cmt_pgs, pg, nlevels - level - 1);
		ASSERT(err == 0);

		/* The lineage starts at the lowest level PG */
		if (level == 0)
			cp->cpu_pg->cmt_lineage = (pg_t *)pg;

		if (pg->cmt_siblings != NULL) {
			/* Already initialized */
			ASSERT(pg->cmt_parent == NULL ||
			    pg->cmt_parent == cpu_cmt_hier[level + 1]);
			ASSERT(pg->cmt_siblings == &lgrp->cl_pgs ||
			    pg->cmt_siblings == pg->cmt_parent->cmt_children);
			continue;
		}

		if ((level + 1) == nlevels) {
			/*
			 * Highest level PG: it has no parent, and its
			 * siblings are the lgroup's top level PGs.
			 */
			pg->cmt_parent = NULL;
			pg->cmt_siblings = &lgrp->cl_pgs;
			children = ++lgrp->cl_npgs;
		} else {
			pg->cmt_parent = cpu_cmt_hier[level + 1];

			/*
			 * A good parent keeps track of their children.
			 * The parent's children group is also the PG's
			 * siblings.
			 */
			if (pg->cmt_parent->cmt_children == NULL) {
				pg->cmt_parent->cmt_children =
				    kmem_zalloc(sizeof (group_t), KM_SLEEP);
				group_create(pg->cmt_parent->cmt_children);
			}
			pg->cmt_siblings = pg->cmt_parent->cmt_children;
			children = ++pg->cmt_parent->cmt_nchildren;
		}
		pg->cmt_hint = 0;

		/*
		 * Make room in the sibling group now; the PG itself is only
		 * added to it (with GRP_NORESIZE) by pg_cmt_cpu_active().
		 */
		group_expand(pg->cmt_siblings, children);
	}

	/*
	 * Cache the chip and core IDs in the cpu_t->cpu_physid structure
	 * for fast lookups later.
	 */
	if (cp->cpu_physid) {
		cp->cpu_physid->cpu_chipid =
		    pg_plat_hw_instance_id(cp, PGHW_CHIP);
		cp->cpu_physid->cpu_coreid = pg_plat_get_core_id(cp);

		/*
		 * If this cpu has a PG representing shared cache, then set
		 * cpu_cacheid to that PG's logical id
		 */
		if (pg_cache)
			cp->cpu_physid->cpu_cacheid = pg_cache->pg_id;
	}

	/*
	 * CPU0 only initialization: account for the boot CPU's running
	 * thread and remember its lgrp for pg_cmt_cpu_fini().
	 */
	if (is_cpu0) {
		pg_cmt_cpu_startup(cp);
		is_cpu0 = 0;
		cpu0_lgrp = lgrp;
	}

}
447 
448 /*
449  * Class callback when a CPU is leaving the system (deletion)
450  */
451 static void
452 pg_cmt_cpu_fini(cpu_t *cp)
453 {
454 	group_iter_t	i;
455 	pg_cmt_t	*pg;
456 	group_t		*pgs, *cmt_pgs;
457 	lgrp_handle_t	lgrp_handle;
458 	cmt_lgrp_t	*lgrp;
459 
460 	pgs = &cp->cpu_pg->pgs;
461 	cmt_pgs = &cp->cpu_pg->cmt_pgs;
462 
463 	/*
464 	 * Find the lgroup that encapsulates this CPU's CMT hierarchy
465 	 */
466 	lgrp_handle = lgrp_plat_cpu_to_hand(cp->cpu_id);
467 
468 	lgrp = pg_cmt_find_lgrp(lgrp_handle);
469 	if (lgrp == NULL) {
470 		/*
471 		 * This is a bit of a special case.
472 		 * The only way this can happen is if the CPU's lgrp
473 		 * handle changed out from underneath us, which is what
474 		 * happens with null_proc_lpa on starcat systems.
475 		 *
476 		 * Use the initial boot CPU lgrp, since this is what
477 		 * we need to tear down.
478 		 */
479 		lgrp = cpu0_lgrp;
480 	}
481 
482 	/*
483 	 * First, clean up anything load balancing specific for each of
484 	 * the CPU's PGs that participated in CMT load balancing
485 	 */
486 	pg = (pg_cmt_t *)cp->cpu_pg->cmt_lineage;
487 	while (pg != NULL) {
488 
489 		/*
490 		 * Remove the PG from the CPU's load balancing lineage
491 		 */
492 		(void) group_remove(cmt_pgs, pg, GRP_RESIZE);
493 
494 		/*
495 		 * If it's about to become empty, destroy it's children
496 		 * group, and remove it's reference from it's siblings.
497 		 * This is done here (rather than below) to avoid removing
498 		 * our reference from a PG that we just eliminated.
499 		 */
500 		if (GROUP_SIZE(&((pg_t *)pg)->pg_cpus) == 1) {
501 			if (pg->cmt_children != NULL)
502 				group_destroy(pg->cmt_children);
503 			if (pg->cmt_siblings != NULL) {
504 				if (pg->cmt_siblings == &lgrp->cl_pgs)
505 					lgrp->cl_npgs--;
506 				else
507 					pg->cmt_parent->cmt_nchildren--;
508 			}
509 		}
510 		pg = pg->cmt_parent;
511 	}
512 
513 	ASSERT(GROUP_SIZE(cmt_pgs) == 0);
514 
515 	/*
516 	 * Now that the load balancing lineage updates have happened,
517 	 * remove the CPU from all it's PGs (destroying any that become
518 	 * empty).
519 	 */
520 	group_iter_init(&i);
521 	while ((pg = group_iterate(pgs, &i)) != NULL) {
522 		if (IS_CMT_PG(pg) == 0)
523 			continue;
524 
525 		pg_cpu_delete((pg_t *)pg, cp);
526 		/*
527 		 * Deleting the CPU from the PG changes the CPU's
528 		 * PG group over which we are actively iterating
529 		 * Re-initialize the iteration
530 		 */
531 		group_iter_init(&i);
532 
533 		if (GROUP_SIZE(&((pg_t *)pg)->pg_cpus) == 0) {
534 
535 			/*
536 			 * The PG has become zero sized, so destroy it.
537 			 */
538 			group_destroy(&pg->cmt_cpus_actv);
539 			bitset_fini(&pg->cmt_cpus_actv_set);
540 			pghw_fini((pghw_t *)pg);
541 
542 			pg_destroy((pg_t *)pg);
543 		}
544 	}
545 }
546 
547 /*
548  * Class callback when a CPU is entering a cpu partition
549  */
550 static void
551 pg_cmt_cpupart_in(cpu_t *cp, cpupart_t *pp)
552 {
553 	group_t		*pgs;
554 	pg_t		*pg;
555 	group_iter_t	i;
556 
557 	ASSERT(MUTEX_HELD(&cpu_lock));
558 
559 	pgs = &cp->cpu_pg->pgs;
560 
561 	/*
562 	 * Ensure that the new partition's PG bitset
563 	 * is large enough for all CMT PG's to which cp
564 	 * belongs
565 	 */
566 	group_iter_init(&i);
567 	while ((pg = group_iterate(pgs, &i)) != NULL) {
568 		if (IS_CMT_PG(pg) == 0)
569 			continue;
570 
571 		if (bitset_capacity(&pp->cp_cmt_pgs) <= pg->pg_id)
572 			bitset_resize(&pp->cp_cmt_pgs, pg->pg_id + 1);
573 	}
574 }
575 
576 /*
577  * Class callback when a CPU is actually moving partitions
578  */
579 static void
580 pg_cmt_cpupart_move(cpu_t *cp, cpupart_t *oldpp, cpupart_t *newpp)
581 {
582 	cpu_t		*cpp;
583 	group_t		*pgs;
584 	pg_t		*pg;
585 	group_iter_t	pg_iter;
586 	pg_cpu_itr_t	cpu_iter;
587 	boolean_t	found;
588 
589 	ASSERT(MUTEX_HELD(&cpu_lock));
590 
591 	pgs = &cp->cpu_pg->pgs;
592 	group_iter_init(&pg_iter);
593 
594 	/*
595 	 * Iterate over the CPUs CMT PGs
596 	 */
597 	while ((pg = group_iterate(pgs, &pg_iter)) != NULL) {
598 
599 		if (IS_CMT_PG(pg) == 0)
600 			continue;
601 
602 		/*
603 		 * Add the PG to the bitset in the new partition.
604 		 */
605 		bitset_add(&newpp->cp_cmt_pgs, pg->pg_id);
606 
607 		/*
608 		 * Remove the PG from the bitset in the old partition
609 		 * if the last of the PG's CPUs have left.
610 		 */
611 		found = B_FALSE;
612 		PG_CPU_ITR_INIT(pg, cpu_iter);
613 		while ((cpp = pg_cpu_next(&cpu_iter)) != NULL) {
614 			if (cpp == cp)
615 				continue;
616 			if (CPU_ACTIVE(cpp) &&
617 			    cpp->cpu_part->cp_id == oldpp->cp_id) {
618 				found = B_TRUE;
619 				break;
620 			}
621 		}
622 		if (!found)
623 			bitset_del(&cp->cpu_part->cp_cmt_pgs, pg->pg_id);
624 	}
625 }
626 
627 /*
628  * Class callback when a CPU becomes active (online)
629  *
630  * This is called in a context where CPUs are paused
631  */
632 static void
633 pg_cmt_cpu_active(cpu_t *cp)
634 {
635 	int		err;
636 	group_iter_t	i;
637 	pg_cmt_t	*pg;
638 	group_t		*pgs;
639 
640 	ASSERT(MUTEX_HELD(&cpu_lock));
641 
642 	pgs = &cp->cpu_pg->pgs;
643 	group_iter_init(&i);
644 
645 	/*
646 	 * Iterate over the CPU's PGs
647 	 */
648 	while ((pg = group_iterate(pgs, &i)) != NULL) {
649 
650 		if (IS_CMT_PG(pg) == 0)
651 			continue;
652 
653 		err = group_add(&pg->cmt_cpus_actv, cp, GRP_NORESIZE);
654 		ASSERT(err == 0);
655 
656 		/*
657 		 * If this is the first active CPU in the PG, and it
658 		 * represents a hardware sharing relationship over which
659 		 * CMT load balancing is performed, add it as a candidate
660 		 * for balancing with it's siblings.
661 		 */
662 		if (GROUP_SIZE(&pg->cmt_cpus_actv) == 1 &&
663 		    pg_cmt_load_bal_hw(((pghw_t *)pg)->pghw_hw)) {
664 			err = group_add(pg->cmt_siblings, pg, GRP_NORESIZE);
665 			ASSERT(err == 0);
666 		}
667 
668 		/*
669 		 * Notate the CPU in the PGs active CPU bitset.
670 		 * Also notate the PG as being active in it's associated
671 		 * partition
672 		 */
673 		bitset_add(&pg->cmt_cpus_actv_set, cp->cpu_seqid);
674 		bitset_add(&cp->cpu_part->cp_cmt_pgs, ((pg_t *)pg)->pg_id);
675 	}
676 }
677 
678 /*
679  * Class callback when a CPU goes inactive (offline)
680  *
681  * This is called in a context where CPUs are paused
682  */
683 static void
684 pg_cmt_cpu_inactive(cpu_t *cp)
685 {
686 	int		err;
687 	group_t		*pgs;
688 	pg_cmt_t	*pg;
689 	cpu_t		*cpp;
690 	group_iter_t	i;
691 	pg_cpu_itr_t	cpu_itr;
692 	boolean_t	found;
693 
694 	ASSERT(MUTEX_HELD(&cpu_lock));
695 
696 	pgs = &cp->cpu_pg->pgs;
697 	group_iter_init(&i);
698 
699 	while ((pg = group_iterate(pgs, &i)) != NULL) {
700 
701 		if (IS_CMT_PG(pg) == 0)
702 			continue;
703 
704 		/*
705 		 * Remove the CPU from the CMT PGs active CPU group
706 		 * bitmap
707 		 */
708 		err = group_remove(&pg->cmt_cpus_actv, cp, GRP_NORESIZE);
709 		ASSERT(err == 0);
710 
711 		bitset_del(&pg->cmt_cpus_actv_set, cp->cpu_seqid);
712 
713 		/*
714 		 * If there are no more active CPUs in this PG over which
715 		 * load was balanced, remove it as a balancing candidate.
716 		 */
717 		if (GROUP_SIZE(&pg->cmt_cpus_actv) == 0 &&
718 		    pg_cmt_load_bal_hw(((pghw_t *)pg)->pghw_hw)) {
719 			err = group_remove(pg->cmt_siblings, pg, GRP_NORESIZE);
720 			ASSERT(err == 0);
721 		}
722 
723 		/*
724 		 * Assert the number of active CPUs does not exceed
725 		 * the total number of CPUs in the PG
726 		 */
727 		ASSERT(GROUP_SIZE(&pg->cmt_cpus_actv) <=
728 		    GROUP_SIZE(&((pg_t *)pg)->pg_cpus));
729 
730 		/*
731 		 * Update the PG bitset in the CPU's old partition
732 		 */
733 		found = B_FALSE;
734 		PG_CPU_ITR_INIT(pg, cpu_itr);
735 		while ((cpp = pg_cpu_next(&cpu_itr)) != NULL) {
736 			if (cpp == cp)
737 				continue;
738 			if (CPU_ACTIVE(cpp) &&
739 			    cpp->cpu_part->cp_id == cp->cpu_part->cp_id) {
740 				found = B_TRUE;
741 				break;
742 			}
743 		}
744 		if (!found) {
745 			bitset_del(&cp->cpu_part->cp_cmt_pgs,
746 			    ((pg_t *)pg)->pg_id);
747 		}
748 	}
749 }
750 
751 /*
752  * Return non-zero if the CPU belongs in the given PG
753  */
754 static int
755 pg_cmt_cpu_belongs(pg_t *pg, cpu_t *cp)
756 {
757 	cpu_t	*pg_cpu;
758 
759 	pg_cpu = GROUP_ACCESS(&pg->pg_cpus, 0);
760 
761 	ASSERT(pg_cpu != NULL);
762 
763 	/*
764 	 * The CPU belongs if, given the nature of the hardware sharing
765 	 * relationship represented by the PG, the CPU has that
766 	 * relationship with some other CPU already in the PG
767 	 */
768 	if (pg_plat_cpus_share(cp, pg_cpu, ((pghw_t *)pg)->pghw_hw))
769 		return (1);
770 
771 	return (0);
772 }
773 
774 /*
775  * Pack the CPUs CMT hierarchy
776  * The hierarchy order is preserved
777  */
778 static void
779 pg_cmt_hier_pack(pg_cmt_t *hier[], int sz)
780 {
781 	int	i, j;
782 
783 	for (i = 0; i < sz; i++) {
784 		if (hier[i] != NULL)
785 			continue;
786 
787 		for (j = i; j < sz; j++) {
788 			if (hier[j] != NULL) {
789 				hier[i] = hier[j];
790 				hier[j] = NULL;
791 				break;
792 			}
793 		}
794 		if (j == sz)
795 			break;
796 	}
797 }
798 
799 /*
800  * Return a cmt_lgrp_t * given an lgroup handle.
801  */
802 static cmt_lgrp_t *
803 pg_cmt_find_lgrp(lgrp_handle_t hand)
804 {
805 	cmt_lgrp_t	*lgrp;
806 
807 	ASSERT(MUTEX_HELD(&cpu_lock));
808 
809 	lgrp = cmt_lgrps;
810 	while (lgrp != NULL) {
811 		if (lgrp->cl_hand == hand)
812 			break;
813 		lgrp = lgrp->cl_next;
814 	}
815 	return (lgrp);
816 }
817 
818 /*
819  * Create a cmt_lgrp_t with the specified handle.
820  */
821 static cmt_lgrp_t *
822 pg_cmt_lgrp_create(lgrp_handle_t hand)
823 {
824 	cmt_lgrp_t	*lgrp;
825 
826 	ASSERT(MUTEX_HELD(&cpu_lock));
827 
828 	lgrp = kmem_zalloc(sizeof (cmt_lgrp_t), KM_SLEEP);
829 
830 	lgrp->cl_hand = hand;
831 	lgrp->cl_npgs = 0;
832 	lgrp->cl_next = cmt_lgrps;
833 	cmt_lgrps = lgrp;
834 	group_create(&lgrp->cl_pgs);
835 
836 	return (lgrp);
837 }
838