xref: /illumos-gate/usr/src/uts/common/disp/cmt.c (revision 150d2c5288c645a1c1a7d2bee61199a3729406c7)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/systm.h>
29 #include <sys/types.h>
30 #include <sys/param.h>
31 #include <sys/thread.h>
32 #include <sys/cpuvar.h>
33 #include <sys/cpupart.h>
34 #include <sys/kmem.h>
35 #include <sys/cmn_err.h>
36 #include <sys/kstat.h>
37 #include <sys/processor.h>
38 #include <sys/disp.h>
39 #include <sys/group.h>
40 #include <sys/pghw.h>
41 #include <sys/bitset.h>
42 #include <sys/lgrp.h>
43 #include <sys/cmt.h>
44 
45 /*
46  * CMT scheduler / dispatcher support
47  *
48  * This file implements CMT scheduler support using Processor Groups.
49  * The CMT processor group class creates and maintains the CMT class
50  * specific processor group pg_cmt_t.
51  *
52  * ---------------------------- <-- pg_cmt_t *
53  * | pghw_t                   |
54  * ----------------------------
55  * | CMT class specific data  |
56  * | - hierarchy linkage      |
57  * | - CMT load balancing data|
58  * | - active CPU group/bitset|
59  * ----------------------------
60  *
61  * The scheduler/dispatcher leverages knowledge of the performance
62  * relevant CMT sharing relationships existing between cpus to implement
63  * optimized affinity and load balancing policies.
64  *
65  * Load balancing policy seeks to improve performance by minimizing
66  * contention over shared processor resources / facilities, while the
67  * affinity policies seek to improve cache and TLB utilization.
68  *
69  * The CMT PGs created by this class are already arranged into a
70  * hierarchy (which is done in the pghw layer). To implement the top-down
71  * CMT load balancing algorithm, the CMT PGs additionally maintain
72  * parent, child and sibling hierarchy relationships.
73  * Parent PGs always contain a superset of their children(s) resources,
74  * each PG can have at most one parent, and siblings are the group of PGs
75  * sharing the same parent.
76  *
 * On NUMA systems, the CMT load balancing algorithm balances across the
 * CMT PGs within their respective lgroups. On UMA based systems, there
 * exists a top level group of PGs to balance across. On NUMA systems
 * multiple top level groups are instantiated, where the top level balancing
 * begins by balancing across the CMT PGs within their respective (per lgroup)
 * top level groups.
83  */
84 
85 typedef struct cmt_lgrp {
86 	group_t		cl_pgs;		/* Top level group of active CMT PGs */
87 	int		cl_npgs;	/* # of top level PGs in the lgroup */
88 	lgrp_handle_t	cl_hand;	/* lgroup's platform handle */
89 	struct cmt_lgrp *cl_next;	/* next cmt_lgrp */
90 } cmt_lgrp_t;
91 
92 static cmt_lgrp_t	*cmt_lgrps = NULL;
93 
94 static int		is_cpu0 = 1;
95 static int		cmt_sched_disabled = 0;
96 
97 static pg_cid_t		pg_cmt_class_id;		/* PG class id */
98 
99 static pg_t		*pg_cmt_alloc();
100 static void		pg_cmt_free(pg_t *);
101 static void		pg_cmt_cpu_init(cpu_t *);
102 static void		pg_cmt_cpu_fini(cpu_t *);
103 static void		pg_cmt_cpu_active(cpu_t *);
104 static void		pg_cmt_cpu_inactive(cpu_t *);
105 static void		pg_cmt_cpupart_in(cpu_t *, cpupart_t *);
106 static void		pg_cmt_cpupart_move(cpu_t *, cpupart_t *, cpupart_t *);
107 static void		pg_cmt_hier_pack(pg_cmt_t **, int);
108 static int		pg_cmt_cpu_belongs(pg_t *, cpu_t *);
109 static int		pg_cmt_hw(pghw_type_t);
110 static cmt_lgrp_t	*pg_cmt_find_lgrp(lgrp_handle_t);
111 
112 /*
113  * Macro to test if PG is managed by the CMT PG class
114  */
115 #define	IS_CMT_PG(pg)	(((pg_t *)(pg))->pg_class->pgc_id == pg_cmt_class_id)
116 
/*
 * CMT PG ops
 *
 * Operations vector passed to pg_class_register() by pg_cmt_class_init().
 * The initializer is positional, so the order of the entries below must
 * match the member order of struct pg_ops.
 */
struct pg_ops pg_ops_cmt = {
	pg_cmt_alloc,		/* class specific PG allocation */
	pg_cmt_free,		/* class specific PG de-allocation */
	pg_cmt_cpu_init,	/* new CPU entering the system */
	pg_cmt_cpu_fini,	/* CPU leaving the system */
	pg_cmt_cpu_active,	/* CPU becomes active (online) */
	pg_cmt_cpu_inactive,	/* CPU goes inactive (offline) */
	pg_cmt_cpupart_in,	/* CPU entering a cpu partition */
	NULL,			/* cpupart_out */
	pg_cmt_cpupart_move,	/* CPU actually moving partitions */
	pg_cmt_cpu_belongs,	/* does the CPU belong in a given PG? */
};
132 
133 /*
134  * Initialize the CMT PG class
135  */
136 void
137 pg_cmt_class_init(void)
138 {
139 	if (cmt_sched_disabled)
140 		return;
141 
142 	pg_cmt_class_id = pg_class_register("cmt", &pg_ops_cmt, PGR_PHYSICAL);
143 }
144 
145 /*
146  * Called to indicate a new CPU has started up so
147  * that either t0 or the slave startup thread can
148  * be accounted for.
149  */
150 void
151 pg_cmt_cpu_startup(cpu_t *cp)
152 {
153 	PG_NRUN_UPDATE(cp, 1);
154 }
155 
156 /*
157  * Adjust the CMT load in the CMT PGs in which the CPU belongs
158  * Note that "n" can be positive in the case of increasing
159  * load, or negative in the case of decreasing load.
160  */
161 void
162 pg_cmt_load(cpu_t *cp, int n)
163 {
164 	pg_cmt_t	*pg;
165 
166 	pg = (pg_cmt_t *)cp->cpu_pg->cmt_lineage;
167 	while (pg != NULL) {
168 		ASSERT(IS_CMT_PG(pg));
169 		atomic_add_32(&pg->cmt_nrunning, n);
170 		pg = pg->cmt_parent;
171 	}
172 }
173 
174 /*
175  * Return non-zero if thread can migrate between "from" and "to"
176  * without a performance penalty
177  */
178 int
179 pg_cmt_can_migrate(cpu_t *from, cpu_t *to)
180 {
181 	if (from->cpu_physid->cpu_cacheid ==
182 	    to->cpu_physid->cpu_cacheid)
183 		return (1);
184 	return (0);
185 }
186 
187 /*
188  * CMT class specific PG allocation
189  */
190 static pg_t *
191 pg_cmt_alloc(void)
192 {
193 	return (kmem_zalloc(sizeof (pg_cmt_t), KM_NOSLEEP));
194 }
195 
196 /*
197  * Class specific PG de-allocation
198  */
199 static void
200 pg_cmt_free(pg_t *pg)
201 {
202 	ASSERT(pg != NULL);
203 	ASSERT(IS_CMT_PG(pg));
204 
205 	kmem_free((pg_cmt_t *)pg, sizeof (pg_cmt_t));
206 }
207 
208 /*
209  * Return 1 if CMT load balancing policies should be
210  * implemented across instances of the specified hardware
211  * sharing relationship.
212  */
213 static int
214 pg_cmt_load_bal_hw(pghw_type_t hw)
215 {
216 	if (hw == PGHW_IPIPE ||
217 	    hw == PGHW_FPU ||
218 	    hw == PGHW_CHIP)
219 		return (1);
220 	else
221 		return (0);
222 }
223 
224 /*
225  * Return 1 if thread affinity polices should be implemented
226  * for instances of the specifed hardware sharing relationship.
227  */
228 static int
229 pg_cmt_affinity_hw(pghw_type_t hw)
230 {
231 	if (hw == PGHW_CACHE)
232 		return (1);
233 	else
234 		return (0);
235 }
236 
237 /*
238  * Return 1 if CMT scheduling policies should be impelmented
239  * for the specified hardware sharing relationship.
240  */
241 static int
242 pg_cmt_hw(pghw_type_t hw)
243 {
244 	return (pg_cmt_load_bal_hw(hw) ||
245 	    pg_cmt_affinity_hw(hw));
246 }
247 
248 /*
249  * CMT class callback for a new CPU entering the system
250  */
251 static void
252 pg_cmt_cpu_init(cpu_t *cp)
253 {
254 	pg_cmt_t	*pg;
255 	group_t		*cmt_pgs;
256 	int		level, max_level, nlevels;
257 	pghw_type_t	hw;
258 	pg_t		*pg_cache = NULL;
259 	pg_cmt_t	*cpu_cmt_hier[PGHW_NUM_COMPONENTS];
260 	lgrp_handle_t	lgrp_handle;
261 	cmt_lgrp_t	*lgrp;
262 
263 	ASSERT(MUTEX_HELD(&cpu_lock));
264 
265 	/*
266 	 * A new CPU is coming into the system.
267 	 * Interrogate the platform to see if the CPU
268 	 * has any performance relevant CMT sharing
269 	 * relationships
270 	 */
271 	cmt_pgs = &cp->cpu_pg->cmt_pgs;
272 	cp->cpu_pg->cmt_lineage = NULL;
273 
274 	bzero(cpu_cmt_hier, sizeof (cpu_cmt_hier));
275 	max_level = nlevels = 0;
276 	for (hw = PGHW_START; hw < PGHW_NUM_COMPONENTS; hw++) {
277 
278 		/*
279 		 * We're only interested in CMT hw sharing relationships
280 		 */
281 		if (pg_cmt_hw(hw) == 0 || pg_plat_hw_shared(cp, hw) == 0)
282 			continue;
283 
284 		/*
285 		 * Find (or create) the PG associated with
286 		 * the hw sharing relationship in which cp
287 		 * belongs.
288 		 *
289 		 * Determine if a suitable PG already
290 		 * exists, or if one needs to be created.
291 		 */
292 		pg = (pg_cmt_t *)pghw_place_cpu(cp, hw);
293 		if (pg == NULL) {
294 			/*
295 			 * Create a new one.
296 			 * Initialize the common...
297 			 */
298 			pg = (pg_cmt_t *)pg_create(pg_cmt_class_id);
299 
300 			/* ... physical ... */
301 			pghw_init((pghw_t *)pg, cp, hw);
302 
303 			/*
304 			 * ... and CMT specific portions of the
305 			 * structure.
306 			 */
307 			bitset_init(&pg->cmt_cpus_actv_set);
308 			group_create(&pg->cmt_cpus_actv);
309 		} else {
310 			ASSERT(IS_CMT_PG(pg));
311 		}
312 
313 		/* Add the CPU to the PG */
314 		pg_cpu_add((pg_t *)pg, cp);
315 
316 		/*
317 		 * Ensure capacity of the active CPUs group/bitset
318 		 */
319 		group_expand(&pg->cmt_cpus_actv,
320 		    GROUP_SIZE(&((pg_t *)pg)->pg_cpus));
321 
322 		if (cp->cpu_seqid >=
323 		    bitset_capacity(&pg->cmt_cpus_actv_set)) {
324 			bitset_resize(&pg->cmt_cpus_actv_set,
325 			    cp->cpu_seqid + 1);
326 		}
327 
328 		/*
329 		 * Build a lineage of CMT PGs for load balancing
330 		 */
331 		if (pg_cmt_load_bal_hw(hw)) {
332 			level = pghw_level(hw);
333 			cpu_cmt_hier[level] = pg;
334 			if (level > max_level)
335 				max_level = level;
336 			nlevels++;
337 		}
338 
339 		/* Cache this for later */
340 		if (hw == PGHW_CACHE)
341 			pg_cache = (pg_t *)pg;
342 	}
343 
344 	/*
345 	 * Pack out any gaps in the constructed lineage.
346 	 * Gaps may exist where the architecture knows
347 	 * about a hardware sharing relationship, but such a
348 	 * relationship either isn't relevant for load
349 	 * balancing or doesn't exist between CPUs on the system.
350 	 */
351 	pg_cmt_hier_pack(cpu_cmt_hier, max_level + 1);
352 
353 	/*
354 	 * For each of the PGs int the CPU's lineage:
355 	 *	- Add an entry in the CPU sorted CMT PG group
356 	 *	  which is used for top down CMT load balancing
357 	 *	- Tie the PG into the CMT hierarchy by connecting
358 	 *	  it to it's parent and siblings.
359 	 */
360 	group_expand(cmt_pgs, nlevels);
361 
362 	/*
363 	 * Find the lgrp that encapsulates this CPU's CMT hierarchy
364 	 */
365 	lgrp_handle = lgrp_plat_cpu_to_hand(cp->cpu_id);
366 	lgrp = pg_cmt_find_lgrp(lgrp_handle);
367 
368 	for (level = 0; level < nlevels; level++) {
369 		uint_t		children;
370 		int		err;
371 
372 		pg = cpu_cmt_hier[level];
373 		err = group_add_at(cmt_pgs, pg, nlevels - level - 1);
374 		ASSERT(err == 0);
375 
376 		if (level == 0)
377 			cp->cpu_pg->cmt_lineage = (pg_t *)pg;
378 
379 		if (pg->cmt_siblings != NULL) {
380 			/* Already initialized */
381 			ASSERT(pg->cmt_parent == NULL ||
382 			    pg->cmt_parent == cpu_cmt_hier[level + 1]);
383 			ASSERT(pg->cmt_siblings == &lgrp->cl_pgs ||
384 			    pg->cmt_siblings == pg->cmt_parent->cmt_children);
385 			continue;
386 		}
387 
388 		if ((level + 1) == nlevels) {
389 			pg->cmt_parent = NULL;
390 			pg->cmt_siblings = &lgrp->cl_pgs;
391 			children = ++lgrp->cl_npgs;
392 		} else {
393 			pg->cmt_parent = cpu_cmt_hier[level + 1];
394 
395 			/*
396 			 * A good parent keeps track of their children.
397 			 * The parent's children group is also the PG's
398 			 * siblings.
399 			 */
400 			if (pg->cmt_parent->cmt_children == NULL) {
401 				pg->cmt_parent->cmt_children =
402 				    kmem_zalloc(sizeof (group_t), KM_SLEEP);
403 				group_create(pg->cmt_parent->cmt_children);
404 			}
405 			pg->cmt_siblings = pg->cmt_parent->cmt_children;
406 			children = ++pg->cmt_parent->cmt_nchildren;
407 		}
408 		pg->cmt_hint = 0;
409 		group_expand(pg->cmt_siblings, children);
410 	}
411 
412 	/*
413 	 * Cache the chip and core IDs in the cpu_t->cpu_physid structure
414 	 * for fast lookups later.
415 	 */
416 	if (cp->cpu_physid) {
417 		cp->cpu_physid->cpu_chipid =
418 		    pg_plat_hw_instance_id(cp, PGHW_CHIP);
419 		cp->cpu_physid->cpu_coreid = pg_plat_get_core_id(cp);
420 
421 		/*
422 		 * If this cpu has a PG representing shared cache, then set
423 		 * cpu_cacheid to that PG's logical id
424 		 */
425 		if (pg_cache)
426 			cp->cpu_physid->cpu_cacheid = pg_cache->pg_id;
427 	}
428 
429 	/* CPU0 only initialization */
430 	if (is_cpu0) {
431 		pg_cmt_cpu_startup(cp);
432 		is_cpu0 = 0;
433 	}
434 
435 }
436 
437 /*
438  * Class callback when a CPU is leaving the system (deletion)
439  */
440 static void
441 pg_cmt_cpu_fini(cpu_t *cp)
442 {
443 	group_iter_t	i;
444 	pg_cmt_t	*pg;
445 	group_t		*pgs, *cmt_pgs;
446 	lgrp_handle_t	lgrp_handle;
447 	cmt_lgrp_t	*lgrp;
448 
449 	pgs = &cp->cpu_pg->pgs;
450 	cmt_pgs = &cp->cpu_pg->cmt_pgs;
451 
452 	/*
453 	 * Find the lgroup that encapsulates this CPU's CMT hierarchy
454 	 */
455 	lgrp_handle = lgrp_plat_cpu_to_hand(cp->cpu_id);
456 	lgrp = pg_cmt_find_lgrp(lgrp_handle);
457 
458 	/*
459 	 * First, clean up anything load balancing specific for each of
460 	 * the CPU's PGs that participated in CMT load balancing
461 	 */
462 	pg = (pg_cmt_t *)cp->cpu_pg->cmt_lineage;
463 	while (pg != NULL) {
464 
465 		/*
466 		 * Remove the PG from the CPU's load balancing lineage
467 		 */
468 		(void) group_remove(cmt_pgs, pg, GRP_RESIZE);
469 
470 		/*
471 		 * If it's about to become empty, destroy it's children
472 		 * group, and remove it's reference from it's siblings.
473 		 * This is done here (rather than below) to avoid removing
474 		 * our reference from a PG that we just eliminated.
475 		 */
476 		if (GROUP_SIZE(&((pg_t *)pg)->pg_cpus) == 1) {
477 			if (pg->cmt_children != NULL)
478 				group_destroy(pg->cmt_children);
479 			if (pg->cmt_siblings != NULL) {
480 				if (pg->cmt_siblings == &lgrp->cl_pgs)
481 					lgrp->cl_npgs--;
482 				else
483 					pg->cmt_parent->cmt_nchildren--;
484 			}
485 		}
486 		pg = pg->cmt_parent;
487 	}
488 
489 	ASSERT(GROUP_SIZE(cmt_pgs) == 0);
490 
491 	/*
492 	 * Now that the load balancing lineage updates have happened,
493 	 * remove the CPU from all it's PGs (destroying any that become
494 	 * empty).
495 	 */
496 	group_iter_init(&i);
497 	while ((pg = group_iterate(pgs, &i)) != NULL) {
498 		if (IS_CMT_PG(pg) == 0)
499 			continue;
500 
501 		pg_cpu_delete((pg_t *)pg, cp);
502 		/*
503 		 * Deleting the CPU from the PG changes the CPU's
504 		 * PG group over which we are actively iterating
505 		 * Re-initialize the iteration
506 		 */
507 		group_iter_init(&i);
508 
509 		if (GROUP_SIZE(&((pg_t *)pg)->pg_cpus) == 0) {
510 
511 			/*
512 			 * The PG has become zero sized, so destroy it.
513 			 */
514 			group_destroy(&pg->cmt_cpus_actv);
515 			bitset_fini(&pg->cmt_cpus_actv_set);
516 			pghw_fini((pghw_t *)pg);
517 
518 			pg_destroy((pg_t *)pg);
519 		}
520 	}
521 }
522 
523 /*
524  * Class callback when a CPU is entering a cpu partition
525  */
526 static void
527 pg_cmt_cpupart_in(cpu_t *cp, cpupart_t *pp)
528 {
529 	group_t		*pgs;
530 	pg_t		*pg;
531 	group_iter_t	i;
532 
533 	ASSERT(MUTEX_HELD(&cpu_lock));
534 
535 	pgs = &cp->cpu_pg->pgs;
536 
537 	/*
538 	 * Ensure that the new partition's PG bitset
539 	 * is large enough for all CMT PG's to which cp
540 	 * belongs
541 	 */
542 	group_iter_init(&i);
543 	while ((pg = group_iterate(pgs, &i)) != NULL) {
544 		if (IS_CMT_PG(pg) == 0)
545 			continue;
546 
547 		if (bitset_capacity(&pp->cp_cmt_pgs) <= pg->pg_id)
548 			bitset_resize(&pp->cp_cmt_pgs, pg->pg_id + 1);
549 	}
550 }
551 
552 /*
553  * Class callback when a CPU is actually moving partitions
554  */
555 static void
556 pg_cmt_cpupart_move(cpu_t *cp, cpupart_t *oldpp, cpupart_t *newpp)
557 {
558 	cpu_t		*cpp;
559 	group_t		*pgs;
560 	pg_t		*pg;
561 	group_iter_t	pg_iter;
562 	pg_cpu_itr_t	cpu_iter;
563 	boolean_t	found;
564 
565 	ASSERT(MUTEX_HELD(&cpu_lock));
566 
567 	pgs = &cp->cpu_pg->pgs;
568 	group_iter_init(&pg_iter);
569 
570 	/*
571 	 * Iterate over the CPUs CMT PGs
572 	 */
573 	while ((pg = group_iterate(pgs, &pg_iter)) != NULL) {
574 
575 		if (IS_CMT_PG(pg) == 0)
576 			continue;
577 
578 		/*
579 		 * Add the PG to the bitset in the new partition.
580 		 */
581 		bitset_add(&newpp->cp_cmt_pgs, pg->pg_id);
582 
583 		/*
584 		 * Remove the PG from the bitset in the old partition
585 		 * if the last of the PG's CPUs have left.
586 		 */
587 		found = B_FALSE;
588 		PG_CPU_ITR_INIT(pg, cpu_iter);
589 		while ((cpp = pg_cpu_next(&cpu_iter)) != NULL) {
590 			if (cpp == cp)
591 				continue;
592 			if (cpp->cpu_part->cp_id == oldpp->cp_id) {
593 				found = B_TRUE;
594 				break;
595 			}
596 		}
597 		if (!found)
598 			bitset_del(&cp->cpu_part->cp_cmt_pgs, pg->pg_id);
599 	}
600 }
601 
602 /*
603  * Class callback when a CPU becomes active (online)
604  *
605  * This is called in a context where CPUs are paused
606  */
607 static void
608 pg_cmt_cpu_active(cpu_t *cp)
609 {
610 	int		err;
611 	group_iter_t	i;
612 	pg_cmt_t	*pg;
613 	group_t		*pgs;
614 
615 	ASSERT(MUTEX_HELD(&cpu_lock));
616 
617 	pgs = &cp->cpu_pg->pgs;
618 	group_iter_init(&i);
619 
620 	/*
621 	 * Iterate over the CPU's PGs
622 	 */
623 	while ((pg = group_iterate(pgs, &i)) != NULL) {
624 
625 		if (IS_CMT_PG(pg) == 0)
626 			continue;
627 
628 		err = group_add(&pg->cmt_cpus_actv, cp, GRP_NORESIZE);
629 		ASSERT(err == 0);
630 
631 		/*
632 		 * If this is the first active CPU in the PG, and it
633 		 * represents a hardware sharing relationship over which
634 		 * CMT load balancing is performed, add it as a candidate
635 		 * for balancing with it's siblings.
636 		 */
637 		if (GROUP_SIZE(&pg->cmt_cpus_actv) == 1 &&
638 		    pg_cmt_load_bal_hw(((pghw_t *)pg)->pghw_hw)) {
639 			err = group_add(pg->cmt_siblings, pg, GRP_NORESIZE);
640 			ASSERT(err == 0);
641 		}
642 
643 		/*
644 		 * Notate the CPU in the PGs active CPU bitset.
645 		 * Also notate the PG as being active in it's associated
646 		 * partition
647 		 */
648 		bitset_add(&pg->cmt_cpus_actv_set, cp->cpu_seqid);
649 		bitset_add(&cp->cpu_part->cp_cmt_pgs, ((pg_t *)pg)->pg_id);
650 	}
651 }
652 
653 /*
654  * Class callback when a CPU goes inactive (offline)
655  *
656  * This is called in a context where CPUs are paused
657  */
658 static void
659 pg_cmt_cpu_inactive(cpu_t *cp)
660 {
661 	int		err;
662 	group_t		*pgs;
663 	pg_cmt_t	*pg;
664 	cpu_t		*cpp;
665 	group_iter_t	i;
666 	pg_cpu_itr_t	cpu_itr;
667 	boolean_t	found;
668 
669 	ASSERT(MUTEX_HELD(&cpu_lock));
670 
671 	pgs = &cp->cpu_pg->pgs;
672 	group_iter_init(&i);
673 
674 	while ((pg = group_iterate(pgs, &i)) != NULL) {
675 
676 		if (IS_CMT_PG(pg) == 0)
677 			continue;
678 
679 		/*
680 		 * Remove the CPU from the CMT PGs active CPU group
681 		 * bitmap
682 		 */
683 		err = group_remove(&pg->cmt_cpus_actv, cp, GRP_NORESIZE);
684 		ASSERT(err == 0);
685 
686 		bitset_del(&pg->cmt_cpus_actv_set, cp->cpu_seqid);
687 
688 		/*
689 		 * If there are no more active CPUs in this PG over which
690 		 * load was balanced, remove it as a balancing candidate.
691 		 */
692 		if (GROUP_SIZE(&pg->cmt_cpus_actv) == 0 &&
693 		    pg_cmt_load_bal_hw(((pghw_t *)pg)->pghw_hw)) {
694 			err = group_remove(pg->cmt_siblings, pg, GRP_NORESIZE);
695 			ASSERT(err == 0);
696 		}
697 
698 		/*
699 		 * Assert the number of active CPUs does not exceed
700 		 * the total number of CPUs in the PG
701 		 */
702 		ASSERT(GROUP_SIZE(&pg->cmt_cpus_actv) <=
703 		    GROUP_SIZE(&((pg_t *)pg)->pg_cpus));
704 
705 		/*
706 		 * Update the PG bitset in the CPU's old partition
707 		 */
708 		found = B_FALSE;
709 		PG_CPU_ITR_INIT(pg, cpu_itr);
710 		while ((cpp = pg_cpu_next(&cpu_itr)) != NULL) {
711 			if (cpp == cp)
712 				continue;
713 			if (cpp->cpu_part->cp_id == cp->cpu_part->cp_id) {
714 				found = B_TRUE;
715 				break;
716 			}
717 		}
718 		if (!found) {
719 			bitset_del(&cp->cpu_part->cp_cmt_pgs,
720 			    ((pg_t *)pg)->pg_id);
721 		}
722 	}
723 }
724 
725 /*
726  * Return non-zero if the CPU belongs in the given PG
727  */
728 static int
729 pg_cmt_cpu_belongs(pg_t *pg, cpu_t *cp)
730 {
731 	cpu_t	*pg_cpu;
732 
733 	pg_cpu = GROUP_ACCESS(&pg->pg_cpus, 0);
734 
735 	ASSERT(pg_cpu != NULL);
736 
737 	/*
738 	 * The CPU belongs if, given the nature of the hardware sharing
739 	 * relationship represented by the PG, the CPU has that
740 	 * relationship with some other CPU already in the PG
741 	 */
742 	if (pg_plat_cpus_share(cp, pg_cpu, ((pghw_t *)pg)->pghw_hw))
743 		return (1);
744 
745 	return (0);
746 }
747 
748 /*
749  * Pack the CPUs CMT hierarchy
750  * The hierarchy order is preserved
751  */
752 static void
753 pg_cmt_hier_pack(pg_cmt_t *hier[], int sz)
754 {
755 	int	i, j;
756 
757 	for (i = 0; i < sz; i++) {
758 		if (hier[i] != NULL)
759 			continue;
760 
761 		for (j = i; j < sz; j++) {
762 			if (hier[j] != NULL) {
763 				hier[i] = hier[j];
764 				hier[j] = NULL;
765 				break;
766 			}
767 		}
768 		if (j == sz)
769 			break;
770 	}
771 }
772 
773 /*
774  * Return a cmt_lgrp_t * given an lgroup handle.
775  * If the right one doesn't yet exist, create one
776  * by growing the cmt_lgrps array
777  */
778 static cmt_lgrp_t *
779 pg_cmt_find_lgrp(lgrp_handle_t hand)
780 {
781 	cmt_lgrp_t	*lgrp;
782 
783 	ASSERT(MUTEX_HELD(&cpu_lock));
784 
785 	lgrp = cmt_lgrps;
786 	while (lgrp != NULL) {
787 		if (lgrp->cl_hand == hand)
788 			return (lgrp);
789 		lgrp = lgrp->cl_next;
790 	}
791 
792 	/*
793 	 * Haven't seen this lgrp yet
794 	 */
795 	lgrp = kmem_zalloc(sizeof (cmt_lgrp_t), KM_SLEEP);
796 
797 	lgrp->cl_hand = hand;
798 	lgrp->cl_npgs = 0;
799 	lgrp->cl_next = cmt_lgrps;
800 	cmt_lgrps = lgrp;
801 	group_create(&lgrp->cl_pgs);
802 
803 	return (lgrp);
804 }
805