xref: /illumos-gate/usr/src/uts/common/disp/cmt.c (revision 3d393ee6c37fa10ac512ed6d36109ad616dc7c1a)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/systm.h>
27 #include <sys/types.h>
28 #include <sys/param.h>
29 #include <sys/thread.h>
30 #include <sys/cpuvar.h>
31 #include <sys/cpupart.h>
32 #include <sys/kmem.h>
33 #include <sys/cmn_err.h>
34 #include <sys/kstat.h>
35 #include <sys/processor.h>
36 #include <sys/disp.h>
37 #include <sys/group.h>
38 #include <sys/pghw.h>
39 #include <sys/bitset.h>
40 #include <sys/lgrp.h>
41 #include <sys/cmt.h>
42 
43 /*
44  * CMT scheduler / dispatcher support
45  *
46  * This file implements CMT scheduler support using Processor Groups.
47  * The CMT processor group class creates and maintains the CMT class
48  * specific processor group pg_cmt_t.
49  *
50  * ---------------------------- <-- pg_cmt_t *
51  * | pghw_t                   |
52  * ----------------------------
53  * | CMT class specific data  |
54  * | - hierarchy linkage      |
55  * | - CMT load balancing data|
56  * | - active CPU group/bitset|
57  * ----------------------------
58  *
59  * The scheduler/dispatcher leverages knowledge of the performance
60  * relevant CMT sharing relationships existing between cpus to implement
61  * optimized affinity and load balancing policies.
62  *
63  * Load balancing policy seeks to improve performance by minimizing
64  * contention over shared processor resources / facilities, while the
65  * affinity policies seek to improve cache and TLB utilization.
66  *
67  * The CMT PGs created by this class are already arranged into a
68  * hierarchy (which is done in the pghw layer). To implement the top-down
69  * CMT load balancing algorithm, the CMT PGs additionally maintain
70  * parent, child and sibling hierarchy relationships.
71  * Parent PGs always contain a superset of their children(s) resources,
72  * each PG can have at most one parent, and siblings are the group of PGs
73  * sharing the same parent.
74  *
75  * On NUMA systems, the CMT load balancing algorithm balances across the
76  * CMT PGs within their respective lgroups. On UMA based system, there
77  * exists a top level group of PGs to balance across. On NUMA systems multiple
78  * top level groups are instantiated, where the top level balancing begins by
79  * balancng across the CMT PGs within their respective (per lgroup) top level
80  * groups.
81  */
82 typedef struct cmt_lgrp {
83 	group_t		cl_pgs;		/* Top level group of active CMT PGs */
84 	int		cl_npgs;	/* # of top level PGs in the lgroup */
85 	lgrp_handle_t	cl_hand;	/* lgroup's platform handle */
86 	struct cmt_lgrp *cl_next;	/* next cmt_lgrp */
87 } cmt_lgrp_t;
88 
89 static cmt_lgrp_t	*cmt_lgrps = NULL;	/* cmt_lgrps list head */
90 static cmt_lgrp_t	*cpu0_lgrp = NULL;	/* boot CPU's initial lgrp */
91 						/* used for null_proc_lpa */
92 static cmt_lgrp_t	*cmt_root = NULL;	/* Reference to root cmt pg */
93 
94 static int		is_cpu0 = 1; /* true if this is boot CPU context */
95 
96 /*
97  * Set this to non-zero to disable CMT scheduling
98  * This must be done via kmdb -d, as /etc/system will be too late
99  */
100 static int		cmt_sched_disabled = 0;
101 
102 static pg_cid_t		pg_cmt_class_id;		/* PG class id */
103 
104 static pg_t		*pg_cmt_alloc();
105 static void		pg_cmt_free(pg_t *);
106 static void		pg_cmt_cpu_init(cpu_t *);
107 static void		pg_cmt_cpu_fini(cpu_t *);
108 static void		pg_cmt_cpu_active(cpu_t *);
109 static void		pg_cmt_cpu_inactive(cpu_t *);
110 static void		pg_cmt_cpupart_in(cpu_t *, cpupart_t *);
111 static void		pg_cmt_cpupart_move(cpu_t *, cpupart_t *, cpupart_t *);
112 static void		pg_cmt_hier_pack(void **, int);
113 static int		pg_cmt_cpu_belongs(pg_t *, cpu_t *);
114 static int		pg_cmt_hw(pghw_type_t);
115 static cmt_lgrp_t	*pg_cmt_find_lgrp(lgrp_handle_t);
116 static cmt_lgrp_t	*pg_cmt_lgrp_create(lgrp_handle_t);
117 
118 /*
119  * Macro to test if PG is managed by the CMT PG class
120  */
121 #define	IS_CMT_PG(pg)	(((pg_t *)(pg))->pg_class->pgc_id == pg_cmt_class_id)
122 
123 /*
124  * CMT PG ops
125  */
126 struct pg_ops pg_ops_cmt = {
127 	pg_cmt_alloc,
128 	pg_cmt_free,
129 	pg_cmt_cpu_init,
130 	pg_cmt_cpu_fini,
131 	pg_cmt_cpu_active,
132 	pg_cmt_cpu_inactive,
133 	pg_cmt_cpupart_in,
134 	NULL,			/* cpupart_out */
135 	pg_cmt_cpupart_move,
136 	pg_cmt_cpu_belongs,
137 };
138 
139 /*
140  * Initialize the CMT PG class
141  */
142 void
143 pg_cmt_class_init(void)
144 {
145 	if (cmt_sched_disabled)
146 		return;
147 
148 	pg_cmt_class_id = pg_class_register("cmt", &pg_ops_cmt, PGR_PHYSICAL);
149 }
150 
151 /*
152  * Called to indicate a new CPU has started up so
153  * that either t0 or the slave startup thread can
154  * be accounted for.
155  */
156 void
157 pg_cmt_cpu_startup(cpu_t *cp)
158 {
159 	PG_NRUN_UPDATE(cp, 1);
160 }
161 
162 /*
163  * Adjust the CMT load in the CMT PGs in which the CPU belongs
164  * Note that "n" can be positive in the case of increasing
165  * load, or negative in the case of decreasing load.
166  */
167 void
168 pg_cmt_load(cpu_t *cp, int n)
169 {
170 	pg_cmt_t	*pg;
171 
172 	pg = (pg_cmt_t *)cp->cpu_pg->cmt_lineage;
173 	while (pg != NULL) {
174 		ASSERT(IS_CMT_PG(pg));
175 		atomic_add_32(&pg->cmt_nrunning, n);
176 		pg = pg->cmt_parent;
177 	}
178 }
179 
180 /*
181  * Return non-zero if thread can migrate between "from" and "to"
182  * without a performance penalty
183  */
184 int
185 pg_cmt_can_migrate(cpu_t *from, cpu_t *to)
186 {
187 	if (from->cpu_physid->cpu_cacheid ==
188 	    to->cpu_physid->cpu_cacheid)
189 		return (1);
190 	return (0);
191 }
192 
193 /*
194  * CMT class specific PG allocation
195  */
196 static pg_t *
197 pg_cmt_alloc(void)
198 {
199 	return (kmem_zalloc(sizeof (pg_cmt_t), KM_NOSLEEP));
200 }
201 
202 /*
203  * Class specific PG de-allocation
204  */
205 static void
206 pg_cmt_free(pg_t *pg)
207 {
208 	ASSERT(pg != NULL);
209 	ASSERT(IS_CMT_PG(pg));
210 
211 	kmem_free((pg_cmt_t *)pg, sizeof (pg_cmt_t));
212 }
213 
214 /*
215  * Return 1 if CMT scheduling policies should be impelmented
216  * for the specified hardware sharing relationship.
217  */
218 static int
219 pg_cmt_hw(pghw_type_t hw)
220 {
221 	return (pg_plat_cmt_load_bal_hw(hw) ||
222 	    pg_plat_cmt_affinity_hw(hw));
223 }
224 
225 /*
226  * CMT class callback for a new CPU entering the system
227  */
228 static void
229 pg_cmt_cpu_init(cpu_t *cp)
230 {
231 	pg_cmt_t	*pg;
232 	group_t		*cmt_pgs;
233 	int		level, max_level, nlevels;
234 	pghw_type_t	hw;
235 	pg_t		*pg_cache = NULL;
236 	pg_cmt_t	*cpu_cmt_hier[PGHW_NUM_COMPONENTS];
237 	lgrp_handle_t	lgrp_handle;
238 	cmt_lgrp_t	*lgrp;
239 
240 	ASSERT(MUTEX_HELD(&cpu_lock));
241 
242 	/*
243 	 * A new CPU is coming into the system.
244 	 * Interrogate the platform to see if the CPU
245 	 * has any performance relevant CMT sharing
246 	 * relationships
247 	 */
248 	cmt_pgs = &cp->cpu_pg->cmt_pgs;
249 	cp->cpu_pg->cmt_lineage = NULL;
250 
251 	bzero(cpu_cmt_hier, sizeof (cpu_cmt_hier));
252 	max_level = nlevels = 0;
253 	for (hw = PGHW_START; hw < PGHW_NUM_COMPONENTS; hw++) {
254 
255 		/*
256 		 * We're only interested in CMT hw sharing relationships
257 		 */
258 		if (pg_cmt_hw(hw) == 0 || pg_plat_hw_shared(cp, hw) == 0)
259 			continue;
260 
261 		/*
262 		 * Find (or create) the PG associated with
263 		 * the hw sharing relationship in which cp
264 		 * belongs.
265 		 *
266 		 * Determine if a suitable PG already
267 		 * exists, or if one needs to be created.
268 		 */
269 		pg = (pg_cmt_t *)pghw_place_cpu(cp, hw);
270 		if (pg == NULL) {
271 			/*
272 			 * Create a new one.
273 			 * Initialize the common...
274 			 */
275 			pg = (pg_cmt_t *)pg_create(pg_cmt_class_id);
276 
277 			/* ... physical ... */
278 			pghw_init((pghw_t *)pg, cp, hw);
279 
280 			/*
281 			 * ... and CMT specific portions of the
282 			 * structure.
283 			 */
284 			bitset_init(&pg->cmt_cpus_actv_set);
285 			group_create(&pg->cmt_cpus_actv);
286 		} else {
287 			ASSERT(IS_CMT_PG(pg));
288 		}
289 
290 		/* Add the CPU to the PG */
291 		pg_cpu_add((pg_t *)pg, cp);
292 
293 		/*
294 		 * Ensure capacity of the active CPU group/bitset
295 		 */
296 		group_expand(&pg->cmt_cpus_actv,
297 		    GROUP_SIZE(&((pg_t *)pg)->pg_cpus));
298 
299 		if (cp->cpu_seqid >=
300 		    bitset_capacity(&pg->cmt_cpus_actv_set)) {
301 			bitset_resize(&pg->cmt_cpus_actv_set,
302 			    cp->cpu_seqid + 1);
303 		}
304 
305 		/*
306 		 * Build a lineage of CMT PGs for load balancing
307 		 */
308 		if (pg_plat_cmt_load_bal_hw(hw)) {
309 			level = pghw_level(hw);
310 			cpu_cmt_hier[level] = pg;
311 			if (level > max_level)
312 				max_level = level;
313 			nlevels++;
314 		}
315 
316 		/* Cache this for later */
317 		if (hw == PGHW_CACHE)
318 			pg_cache = (pg_t *)pg;
319 	}
320 
321 	/*
322 	 * Pack out any gaps in the constructed lineage,
323 	 * then size it out.
324 	 *
325 	 * Gaps may exist where the architecture knows
326 	 * about a hardware sharing relationship, but such a
327 	 * relationship either isn't relevant for load
328 	 * balancing or doesn't exist between CPUs on the system.
329 	 */
330 	pg_cmt_hier_pack((void **)cpu_cmt_hier, max_level + 1);
331 	group_expand(cmt_pgs, nlevels);
332 
333 
334 	if (cmt_root == NULL)
335 		cmt_root = pg_cmt_lgrp_create(lgrp_plat_root_hand());
336 
337 	/*
338 	 * Find the lgrp that encapsulates this CPU's CMT hierarchy.
339 	 * and locate/create a suitable cmt_lgrp_t.
340 	 */
341 	lgrp_handle = lgrp_plat_cpu_to_hand(cp->cpu_id);
342 	if ((lgrp = pg_cmt_find_lgrp(lgrp_handle)) == NULL)
343 		lgrp = pg_cmt_lgrp_create(lgrp_handle);
344 
345 	/*
346 	 * For each of the PGs in the CPU's lineage:
347 	 *	- Add an entry in the CPU's CMT PG group
348 	 *	  which is used by the dispatcher to implement load balancing
349 	 *	  policy.
350 	 *	- Tie the PG into the CMT hierarchy by connecting
351 	 *	  it to it's parent and siblings.
352 	 */
353 	for (level = 0; level < nlevels; level++) {
354 		uint_t		children;
355 		int		err;
356 
357 		pg = cpu_cmt_hier[level];
358 		err = group_add_at(cmt_pgs, pg, nlevels - level - 1);
359 		ASSERT(err == 0);
360 
361 		if (level == 0)
362 			cp->cpu_pg->cmt_lineage = (pg_t *)pg;
363 
364 		if (pg->cmt_siblings != NULL) {
365 			/* Already initialized */
366 			ASSERT(pg->cmt_parent == NULL ||
367 			    pg->cmt_parent == cpu_cmt_hier[level + 1]);
368 			ASSERT(pg->cmt_siblings == &lgrp->cl_pgs ||
369 			    ((pg->cmt_parent != NULL) &&
370 			    pg->cmt_siblings == pg->cmt_parent->cmt_children));
371 			continue;
372 		}
373 
374 		if ((level + 1) == nlevels) {
375 			pg->cmt_parent = NULL;
376 
377 			pg->cmt_siblings = &lgrp->cl_pgs;
378 			children = ++lgrp->cl_npgs;
379 			cmt_root->cl_npgs++;
380 		} else {
381 			pg->cmt_parent = cpu_cmt_hier[level + 1];
382 
383 			/*
384 			 * A good parent keeps track of their children.
385 			 * The parent's children group is also the PG's
386 			 * siblings.
387 			 */
388 			if (pg->cmt_parent->cmt_children == NULL) {
389 				pg->cmt_parent->cmt_children =
390 				    kmem_zalloc(sizeof (group_t), KM_SLEEP);
391 				group_create(pg->cmt_parent->cmt_children);
392 			}
393 			pg->cmt_siblings = pg->cmt_parent->cmt_children;
394 			children = ++pg->cmt_parent->cmt_nchildren;
395 		}
396 
397 		group_expand(pg->cmt_siblings, children);
398 		group_expand(&cmt_root->cl_pgs, cmt_root->cl_npgs);
399 	}
400 
401 	/*
402 	 * Cache the chip and core IDs in the cpu_t->cpu_physid structure
403 	 * for fast lookups later.
404 	 */
405 	if (cp->cpu_physid) {
406 		cp->cpu_physid->cpu_chipid =
407 		    pg_plat_hw_instance_id(cp, PGHW_CHIP);
408 		cp->cpu_physid->cpu_coreid = pg_plat_get_core_id(cp);
409 
410 		/*
411 		 * If this cpu has a PG representing shared cache, then set
412 		 * cpu_cacheid to that PG's logical id
413 		 */
414 		if (pg_cache)
415 			cp->cpu_physid->cpu_cacheid = pg_cache->pg_id;
416 	}
417 
418 	/* CPU0 only initialization */
419 	if (is_cpu0) {
420 		pg_cmt_cpu_startup(cp);
421 		is_cpu0 = 0;
422 		cpu0_lgrp = lgrp;
423 	}
424 
425 }
426 
427 /*
428  * Class callback when a CPU is leaving the system (deletion)
429  */
430 static void
431 pg_cmt_cpu_fini(cpu_t *cp)
432 {
433 	group_iter_t	i;
434 	pg_cmt_t	*pg;
435 	group_t		*pgs, *cmt_pgs;
436 	lgrp_handle_t	lgrp_handle;
437 	cmt_lgrp_t	*lgrp;
438 
439 	pgs = &cp->cpu_pg->pgs;
440 	cmt_pgs = &cp->cpu_pg->cmt_pgs;
441 
442 	/*
443 	 * Find the lgroup that encapsulates this CPU's CMT hierarchy
444 	 */
445 	lgrp_handle = lgrp_plat_cpu_to_hand(cp->cpu_id);
446 
447 	lgrp = pg_cmt_find_lgrp(lgrp_handle);
448 	if (ncpus == 1 && lgrp != cpu0_lgrp) {
449 		/*
450 		 * One might wonder how we could be deconfiguring the
451 		 * only CPU in the system.
452 		 *
453 		 * On Starcat systems when null_proc_lpa is detected,
454 		 * the boot CPU (which is already configured into a leaf
455 		 * lgroup), is moved into the root lgroup. This is done by
456 		 * deconfiguring it from both lgroups and processor
457 		 * groups), and then later reconfiguring it back in.  This
458 		 * call to pg_cmt_cpu_fini() is part of that deconfiguration.
459 		 *
460 		 * This special case is detected by noting that the platform
461 		 * has changed the CPU's lgrp affiliation (since it now
462 		 * belongs in the root). In this case, use the cmt_lgrp_t
463 		 * cached for the boot CPU, since this is what needs to be
464 		 * torn down.
465 		 */
466 		lgrp = cpu0_lgrp;
467 	}
468 
469 	ASSERT(lgrp != NULL);
470 
471 	/*
472 	 * First, clean up anything load balancing specific for each of
473 	 * the CPU's PGs that participated in CMT load balancing
474 	 */
475 	pg = (pg_cmt_t *)cp->cpu_pg->cmt_lineage;
476 	while (pg != NULL) {
477 
478 		/*
479 		 * Remove the PG from the CPU's load balancing lineage
480 		 */
481 		(void) group_remove(cmt_pgs, pg, GRP_RESIZE);
482 
483 		/*
484 		 * If it's about to become empty, destroy it's children
485 		 * group, and remove it's reference from it's siblings.
486 		 * This is done here (rather than below) to avoid removing
487 		 * our reference from a PG that we just eliminated.
488 		 */
489 		if (GROUP_SIZE(&((pg_t *)pg)->pg_cpus) == 1) {
490 			if (pg->cmt_children != NULL)
491 				group_destroy(pg->cmt_children);
492 			if (pg->cmt_siblings != NULL) {
493 				if (pg->cmt_siblings == &lgrp->cl_pgs)
494 					lgrp->cl_npgs--;
495 				else
496 					pg->cmt_parent->cmt_nchildren--;
497 			}
498 		}
499 		pg = pg->cmt_parent;
500 	}
501 	ASSERT(GROUP_SIZE(cmt_pgs) == 0);
502 
503 	/*
504 	 * Now that the load balancing lineage updates have happened,
505 	 * remove the CPU from all it's PGs (destroying any that become
506 	 * empty).
507 	 */
508 	group_iter_init(&i);
509 	while ((pg = group_iterate(pgs, &i)) != NULL) {
510 		if (IS_CMT_PG(pg) == 0)
511 			continue;
512 
513 		pg_cpu_delete((pg_t *)pg, cp);
514 		/*
515 		 * Deleting the CPU from the PG changes the CPU's
516 		 * PG group over which we are actively iterating
517 		 * Re-initialize the iteration
518 		 */
519 		group_iter_init(&i);
520 
521 		if (GROUP_SIZE(&((pg_t *)pg)->pg_cpus) == 0) {
522 
523 			/*
524 			 * The PG has become zero sized, so destroy it.
525 			 */
526 			group_destroy(&pg->cmt_cpus_actv);
527 			bitset_fini(&pg->cmt_cpus_actv_set);
528 			pghw_fini((pghw_t *)pg);
529 
530 			pg_destroy((pg_t *)pg);
531 		}
532 	}
533 }
534 
535 /*
536  * Class callback when a CPU is entering a cpu partition
537  */
538 static void
539 pg_cmt_cpupart_in(cpu_t *cp, cpupart_t *pp)
540 {
541 	group_t		*pgs;
542 	pg_t		*pg;
543 	group_iter_t	i;
544 
545 	ASSERT(MUTEX_HELD(&cpu_lock));
546 
547 	pgs = &cp->cpu_pg->pgs;
548 
549 	/*
550 	 * Ensure that the new partition's PG bitset
551 	 * is large enough for all CMT PG's to which cp
552 	 * belongs
553 	 */
554 	group_iter_init(&i);
555 	while ((pg = group_iterate(pgs, &i)) != NULL) {
556 		if (IS_CMT_PG(pg) == 0)
557 			continue;
558 
559 		if (bitset_capacity(&pp->cp_cmt_pgs) <= pg->pg_id)
560 			bitset_resize(&pp->cp_cmt_pgs, pg->pg_id + 1);
561 	}
562 }
563 
564 /*
565  * Class callback when a CPU is actually moving partitions
566  */
567 static void
568 pg_cmt_cpupart_move(cpu_t *cp, cpupart_t *oldpp, cpupart_t *newpp)
569 {
570 	cpu_t		*cpp;
571 	group_t		*pgs;
572 	pg_t		*pg;
573 	group_iter_t	pg_iter;
574 	pg_cpu_itr_t	cpu_iter;
575 	boolean_t	found;
576 
577 	ASSERT(MUTEX_HELD(&cpu_lock));
578 
579 	pgs = &cp->cpu_pg->pgs;
580 	group_iter_init(&pg_iter);
581 
582 	/*
583 	 * Iterate over the CPUs CMT PGs
584 	 */
585 	while ((pg = group_iterate(pgs, &pg_iter)) != NULL) {
586 
587 		if (IS_CMT_PG(pg) == 0)
588 			continue;
589 
590 		/*
591 		 * Add the PG to the bitset in the new partition.
592 		 */
593 		bitset_add(&newpp->cp_cmt_pgs, pg->pg_id);
594 
595 		/*
596 		 * Remove the PG from the bitset in the old partition
597 		 * if the last of the PG's CPUs have left.
598 		 */
599 		found = B_FALSE;
600 		PG_CPU_ITR_INIT(pg, cpu_iter);
601 		while ((cpp = pg_cpu_next(&cpu_iter)) != NULL) {
602 			if (cpp == cp)
603 				continue;
604 			if (CPU_ACTIVE(cpp) &&
605 			    cpp->cpu_part->cp_id == oldpp->cp_id) {
606 				found = B_TRUE;
607 				break;
608 			}
609 		}
610 		if (!found)
611 			bitset_del(&cp->cpu_part->cp_cmt_pgs, pg->pg_id);
612 	}
613 }
614 
615 /*
616  * Class callback when a CPU becomes active (online)
617  *
618  * This is called in a context where CPUs are paused
619  */
620 static void
621 pg_cmt_cpu_active(cpu_t *cp)
622 {
623 	int		err;
624 	group_iter_t	i;
625 	pg_cmt_t	*pg;
626 	group_t		*pgs;
627 
628 	ASSERT(MUTEX_HELD(&cpu_lock));
629 
630 	pgs = &cp->cpu_pg->pgs;
631 	group_iter_init(&i);
632 
633 	/*
634 	 * Iterate over the CPU's PGs
635 	 */
636 	while ((pg = group_iterate(pgs, &i)) != NULL) {
637 
638 		if (IS_CMT_PG(pg) == 0)
639 			continue;
640 
641 		err = group_add(&pg->cmt_cpus_actv, cp, GRP_NORESIZE);
642 		ASSERT(err == 0);
643 
644 		/*
645 		 * If this is the first active CPU in the PG, and it
646 		 * represents a hardware sharing relationship over which
647 		 * CMT load balancing is performed, add it as a candidate
648 		 * for balancing with it's siblings.
649 		 */
650 		if (GROUP_SIZE(&pg->cmt_cpus_actv) == 1 &&
651 		    pg_plat_cmt_load_bal_hw(((pghw_t *)pg)->pghw_hw)) {
652 			err = group_add(pg->cmt_siblings, pg, GRP_NORESIZE);
653 			ASSERT(err == 0);
654 
655 			/*
656 			 * If this is a top level PG, add it as a balancing
657 			 * candidate when balancing within the root lgroup
658 			 */
659 			if (pg->cmt_parent == NULL) {
660 				err = group_add(&cmt_root->cl_pgs, pg,
661 				    GRP_NORESIZE);
662 				ASSERT(err == 0);
663 			}
664 		}
665 
666 		/*
667 		 * Notate the CPU in the PGs active CPU bitset.
668 		 * Also notate the PG as being active in it's associated
669 		 * partition
670 		 */
671 		bitset_add(&pg->cmt_cpus_actv_set, cp->cpu_seqid);
672 		bitset_add(&cp->cpu_part->cp_cmt_pgs, ((pg_t *)pg)->pg_id);
673 	}
674 }
675 
676 /*
677  * Class callback when a CPU goes inactive (offline)
678  *
679  * This is called in a context where CPUs are paused
680  */
681 static void
682 pg_cmt_cpu_inactive(cpu_t *cp)
683 {
684 	int		err;
685 	group_t		*pgs;
686 	pg_cmt_t	*pg;
687 	cpu_t		*cpp;
688 	group_iter_t	i;
689 	pg_cpu_itr_t	cpu_itr;
690 	boolean_t	found;
691 
692 	ASSERT(MUTEX_HELD(&cpu_lock));
693 
694 	pgs = &cp->cpu_pg->pgs;
695 	group_iter_init(&i);
696 
697 	while ((pg = group_iterate(pgs, &i)) != NULL) {
698 
699 		if (IS_CMT_PG(pg) == 0)
700 			continue;
701 
702 		/*
703 		 * Remove the CPU from the CMT PGs active CPU group
704 		 * bitmap
705 		 */
706 		err = group_remove(&pg->cmt_cpus_actv, cp, GRP_NORESIZE);
707 		ASSERT(err == 0);
708 
709 		bitset_del(&pg->cmt_cpus_actv_set, cp->cpu_seqid);
710 
711 		/*
712 		 * If there are no more active CPUs in this PG over which
713 		 * load was balanced, remove it as a balancing candidate.
714 		 */
715 		if (GROUP_SIZE(&pg->cmt_cpus_actv) == 0 &&
716 		    pg_plat_cmt_load_bal_hw(((pghw_t *)pg)->pghw_hw)) {
717 			err = group_remove(pg->cmt_siblings, pg, GRP_NORESIZE);
718 			ASSERT(err == 0);
719 
720 			if (pg->cmt_parent == NULL) {
721 				err = group_remove(&cmt_root->cl_pgs, pg,
722 				    GRP_NORESIZE);
723 				ASSERT(err == 0);
724 			}
725 		}
726 
727 		/*
728 		 * Assert the number of active CPUs does not exceed
729 		 * the total number of CPUs in the PG
730 		 */
731 		ASSERT(GROUP_SIZE(&pg->cmt_cpus_actv) <=
732 		    GROUP_SIZE(&((pg_t *)pg)->pg_cpus));
733 
734 		/*
735 		 * Update the PG bitset in the CPU's old partition
736 		 */
737 		found = B_FALSE;
738 		PG_CPU_ITR_INIT(pg, cpu_itr);
739 		while ((cpp = pg_cpu_next(&cpu_itr)) != NULL) {
740 			if (cpp == cp)
741 				continue;
742 			if (CPU_ACTIVE(cpp) &&
743 			    cpp->cpu_part->cp_id == cp->cpu_part->cp_id) {
744 				found = B_TRUE;
745 				break;
746 			}
747 		}
748 		if (!found) {
749 			bitset_del(&cp->cpu_part->cp_cmt_pgs,
750 			    ((pg_t *)pg)->pg_id);
751 		}
752 	}
753 }
754 
755 /*
756  * Return non-zero if the CPU belongs in the given PG
757  */
758 static int
759 pg_cmt_cpu_belongs(pg_t *pg, cpu_t *cp)
760 {
761 	cpu_t	*pg_cpu;
762 
763 	pg_cpu = GROUP_ACCESS(&pg->pg_cpus, 0);
764 
765 	ASSERT(pg_cpu != NULL);
766 
767 	/*
768 	 * The CPU belongs if, given the nature of the hardware sharing
769 	 * relationship represented by the PG, the CPU has that
770 	 * relationship with some other CPU already in the PG
771 	 */
772 	if (pg_plat_cpus_share(cp, pg_cpu, ((pghw_t *)pg)->pghw_hw))
773 		return (1);
774 
775 	return (0);
776 }
777 
778 /*
779  * Hierarchy packing utility routine. The hierarchy order is preserved.
780  */
781 static void
782 pg_cmt_hier_pack(void *hier[], int sz)
783 {
784 	int	i, j;
785 
786 	for (i = 0; i < sz; i++) {
787 		if (hier[i] != NULL)
788 			continue;
789 
790 		for (j = i; j < sz; j++) {
791 			if (hier[j] != NULL) {
792 				hier[i] = hier[j];
793 				hier[j] = NULL;
794 				break;
795 			}
796 		}
797 		if (j == sz)
798 			break;
799 	}
800 }
801 
802 /*
803  * Return a cmt_lgrp_t * given an lgroup handle.
804  */
805 static cmt_lgrp_t *
806 pg_cmt_find_lgrp(lgrp_handle_t hand)
807 {
808 	cmt_lgrp_t	*lgrp;
809 
810 	ASSERT(MUTEX_HELD(&cpu_lock));
811 
812 	lgrp = cmt_lgrps;
813 	while (lgrp != NULL) {
814 		if (lgrp->cl_hand == hand)
815 			break;
816 		lgrp = lgrp->cl_next;
817 	}
818 	return (lgrp);
819 }
820 
821 /*
822  * Create a cmt_lgrp_t with the specified handle.
823  */
824 static cmt_lgrp_t *
825 pg_cmt_lgrp_create(lgrp_handle_t hand)
826 {
827 	cmt_lgrp_t	*lgrp;
828 
829 	ASSERT(MUTEX_HELD(&cpu_lock));
830 
831 	lgrp = kmem_zalloc(sizeof (cmt_lgrp_t), KM_SLEEP);
832 
833 	lgrp->cl_hand = hand;
834 	lgrp->cl_npgs = 0;
835 	lgrp->cl_next = cmt_lgrps;
836 	cmt_lgrps = lgrp;
837 	group_create(&lgrp->cl_pgs);
838 
839 	return (lgrp);
840 }
841 
842 /*
843  * Perform multi-level CMT load balancing of running threads.
844  *
845  * tp is the thread being enqueued.
846  * cp is a hint CPU, against which CMT load balancing will be performed.
847  *
848  * Returns cp, or a CPU better than cp with respect to balancing
849  * running thread load.
850  */
851 cpu_t *
852 cmt_balance(kthread_t *tp, cpu_t *cp)
853 {
854 	int		hint, i, cpu, nsiblings;
855 	int		self = 0;
856 	group_t		*cmt_pgs, *siblings;
857 	pg_cmt_t	*pg, *pg_tmp, *tpg = NULL;
858 	int		pg_nrun, tpg_nrun;
859 	int		level = 0;
860 	cpu_t		*newcp;
861 
862 	ASSERT(THREAD_LOCK_HELD(tp));
863 
864 	cmt_pgs = &cp->cpu_pg->cmt_pgs;
865 
866 	if (GROUP_SIZE(cmt_pgs) == 0)
867 		return (cp);	/* nothing to do */
868 
869 	if (tp == curthread)
870 		self = 1;
871 
872 	/*
873 	 * Balance across siblings in the CPUs CMT lineage
874 	 * If the thread is homed to the root lgroup, perform
875 	 * top level balancing against other top level PGs
876 	 * in the system. Otherwise, start with the default
877 	 * top level siblings group, which is within the leaf lgroup
878 	 */
879 	pg = GROUP_ACCESS(cmt_pgs, level);
880 	if (tp->t_lpl->lpl_lgrpid == LGRP_ROOTID)
881 		siblings = &cmt_root->cl_pgs;
882 	else
883 		siblings = pg->cmt_siblings;
884 
885 	/*
886 	 * Traverse down the lineage until we find a level that needs
887 	 * balancing, or we get to the end.
888 	 */
889 	for (;;) {
890 		nsiblings = GROUP_SIZE(siblings);	/* self inclusive */
891 		if (nsiblings == 1)
892 			goto next_level;
893 
894 		pg_nrun = pg->cmt_nrunning;
895 		if (self &&
896 		    bitset_in_set(&pg->cmt_cpus_actv_set, CPU->cpu_seqid))
897 			pg_nrun--;	/* Ignore curthread's effect */
898 
899 		hint = CPU_PSEUDO_RANDOM() % nsiblings;
900 
901 		/*
902 		 * Find a balancing candidate from among our siblings
903 		 * "hint" is a hint for where to start looking
904 		 */
905 		i = hint;
906 		do {
907 			ASSERT(i < nsiblings);
908 			pg_tmp = GROUP_ACCESS(siblings, i);
909 
910 			/*
911 			 * The candidate must not be us, and must
912 			 * have some CPU resources in the thread's
913 			 * partition
914 			 */
915 			if (pg_tmp != pg &&
916 			    bitset_in_set(&tp->t_cpupart->cp_cmt_pgs,
917 			    ((pg_t *)pg_tmp)->pg_id)) {
918 				tpg = pg_tmp;
919 				break;
920 			}
921 
922 			if (++i >= nsiblings)
923 				i = 0;
924 		} while (i != hint);
925 
926 		if (!tpg)
927 			goto next_level; /* no candidates at this level */
928 
929 		/*
930 		 * Check if the balancing target is underloaded
931 		 * Decide to balance if the target is running fewer
932 		 * threads, or if it's running the same number of threads
933 		 * with more online CPUs
934 		 */
935 		tpg_nrun = tpg->cmt_nrunning;
936 		if (pg_nrun > tpg_nrun ||
937 		    (pg_nrun == tpg_nrun &&
938 		    (GROUP_SIZE(&tpg->cmt_cpus_actv) >
939 		    GROUP_SIZE(&pg->cmt_cpus_actv)))) {
940 			break;
941 		}
942 		tpg = NULL;
943 
944 next_level:
945 		if (++level == GROUP_SIZE(cmt_pgs))
946 			break;
947 
948 		pg = GROUP_ACCESS(cmt_pgs, level);
949 		siblings = pg->cmt_siblings;
950 	}
951 
952 	if (tpg) {
953 		uint_t	tgt_size = GROUP_SIZE(&tpg->cmt_cpus_actv);
954 
955 		/*
956 		 * Select an idle CPU from the target
957 		 */
958 		hint = CPU_PSEUDO_RANDOM() % tgt_size;
959 		cpu = hint;
960 		do {
961 			newcp = GROUP_ACCESS(&tpg->cmt_cpus_actv, cpu);
962 			if (newcp->cpu_part == tp->t_cpupart &&
963 			    newcp->cpu_dispatch_pri == -1) {
964 				cp = newcp;
965 				break;
966 			}
967 			if (++cpu == tgt_size)
968 				cpu = 0;
969 		} while (cpu != hint);
970 	}
971 
972 	return (cp);
973 }
974