xref: /titanic_50/usr/src/uts/common/disp/cmt.c (revision 3e81cacf8672b40d79c410d4b7858729d77912ff)
1fb2f18f8Sesaxe /*
2fb2f18f8Sesaxe  * CDDL HEADER START
3fb2f18f8Sesaxe  *
4fb2f18f8Sesaxe  * The contents of this file are subject to the terms of the
5fb2f18f8Sesaxe  * Common Development and Distribution License (the "License").
6fb2f18f8Sesaxe  * You may not use this file except in compliance with the License.
7fb2f18f8Sesaxe  *
8fb2f18f8Sesaxe  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9fb2f18f8Sesaxe  * or http://www.opensolaris.org/os/licensing.
10fb2f18f8Sesaxe  * See the License for the specific language governing permissions
11fb2f18f8Sesaxe  * and limitations under the License.
12fb2f18f8Sesaxe  *
13fb2f18f8Sesaxe  * When distributing Covered Code, include this CDDL HEADER in each
14fb2f18f8Sesaxe  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15fb2f18f8Sesaxe  * If applicable, add the following below this CDDL HEADER, with the
16fb2f18f8Sesaxe  * fields enclosed by brackets "[]" replaced with your own identifying
17fb2f18f8Sesaxe  * information: Portions Copyright [yyyy] [name of copyright owner]
18fb2f18f8Sesaxe  *
19fb2f18f8Sesaxe  * CDDL HEADER END
20fb2f18f8Sesaxe  */
21fb2f18f8Sesaxe /*
22*3e81cacfSEric Saxe  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23fb2f18f8Sesaxe  * Use is subject to license terms.
24fb2f18f8Sesaxe  */
25fb2f18f8Sesaxe 
26fb2f18f8Sesaxe #include <sys/systm.h>
27fb2f18f8Sesaxe #include <sys/types.h>
28fb2f18f8Sesaxe #include <sys/param.h>
29fb2f18f8Sesaxe #include <sys/thread.h>
30fb2f18f8Sesaxe #include <sys/cpuvar.h>
31fb2f18f8Sesaxe #include <sys/cpupart.h>
32fb2f18f8Sesaxe #include <sys/kmem.h>
33fb2f18f8Sesaxe #include <sys/cmn_err.h>
34fb2f18f8Sesaxe #include <sys/kstat.h>
35fb2f18f8Sesaxe #include <sys/processor.h>
36fb2f18f8Sesaxe #include <sys/disp.h>
37fb2f18f8Sesaxe #include <sys/group.h>
38fb2f18f8Sesaxe #include <sys/pghw.h>
39fb2f18f8Sesaxe #include <sys/bitset.h>
40fb2f18f8Sesaxe #include <sys/lgrp.h>
41fb2f18f8Sesaxe #include <sys/cmt.h>
42fb2f18f8Sesaxe 
43fb2f18f8Sesaxe /*
44fb2f18f8Sesaxe  * CMT scheduler / dispatcher support
45fb2f18f8Sesaxe  *
46fb2f18f8Sesaxe  * This file implements CMT scheduler support using Processor Groups.
47fb2f18f8Sesaxe  * The CMT processor group class creates and maintains the CMT class
48fb2f18f8Sesaxe  * specific processor group pg_cmt_t.
49fb2f18f8Sesaxe  *
50fb2f18f8Sesaxe  * ---------------------------- <-- pg_cmt_t *
51fb2f18f8Sesaxe  * | pghw_t                   |
52fb2f18f8Sesaxe  * ----------------------------
53fb2f18f8Sesaxe  * | CMT class specific data  |
54fb2f18f8Sesaxe  * | - hierarchy linkage      |
55fb2f18f8Sesaxe  * | - CMT load balancing data|
56fb2f18f8Sesaxe  * | - active CPU group/bitset|
57fb2f18f8Sesaxe  * ----------------------------
58fb2f18f8Sesaxe  *
59fb2f18f8Sesaxe  * The scheduler/dispatcher leverages knowledge of the performance
60fb2f18f8Sesaxe  * relevant CMT sharing relationships existing between cpus to implement
61fb2f18f8Sesaxe  * optimized affinity and load balancing policies.
62fb2f18f8Sesaxe  *
63fb2f18f8Sesaxe  * Load balancing policy seeks to improve performance by minimizing
64fb2f18f8Sesaxe  * contention over shared processor resources / facilities, while the
65fb2f18f8Sesaxe  * affinity policies seek to improve cache and TLB utilization.
66fb2f18f8Sesaxe  *
67fb2f18f8Sesaxe  * The CMT PGs created by this class are already arranged into a
68fb2f18f8Sesaxe  * hierarchy (which is done in the pghw layer). To implement the top-down
69fb2f18f8Sesaxe  * CMT load balancing algorithm, the CMT PGs additionally maintain
70fb2f18f8Sesaxe  * parent, child and sibling hierarchy relationships.
71fb2f18f8Sesaxe  * Parent PGs always contain a superset of their children(s) resources,
72fb2f18f8Sesaxe  * each PG can have at most one parent, and siblings are the group of PGs
73fb2f18f8Sesaxe  * sharing the same parent.
74fb2f18f8Sesaxe  *
75fb2f18f8Sesaxe  * On NUMA systems, the CMT load balancing algorithm balances across the
76fb2f18f8Sesaxe  * CMT PGs within their respective lgroups. On UMA based system, there
77fb2f18f8Sesaxe  * exists a top level group of PGs to balance across. On NUMA systems multiple
78fb2f18f8Sesaxe  * top level groups are instantiated, where the top level balancing begins by
79fb2f18f8Sesaxe  * balancing across the CMT PGs within their respective (per lgroup) top level
80fb2f18f8Sesaxe  * groups.
81fb2f18f8Sesaxe  */
/*
 * Per-lgroup state for the CMT class. One cmt_lgrp_t exists per lgroup
 * platform handle; instances are kept on a singly linked list headed by
 * cmt_lgrps (see cl_next).
 */
typedef struct cmt_lgrp {
	group_t		cl_pgs;		/* Top level group of active CMT PGs */
	int		cl_npgs;	/* # of top level PGs in the lgroup */
	lgrp_handle_t	cl_hand;	/* lgroup's platform handle */
	struct cmt_lgrp *cl_next;	/* next cmt_lgrp */
} cmt_lgrp_t;
88fb2f18f8Sesaxe 
static cmt_lgrp_t	*cmt_lgrps = NULL;	/* cmt_lgrps list head */
static cmt_lgrp_t	*cpu0_lgrp = NULL;	/* boot CPU's initial lgrp */
						/* used for null_proc_lpa */
static cmt_lgrp_t	*cmt_root = NULL;	/* Reference to root cmt pg */

static int		is_cpu0 = 1; /* true if this is boot CPU context */

/*
 * Set this to non-zero to disable CMT scheduling
 * This must be done via kmdb -d, as /etc/system will be too late
 */
static int		cmt_sched_disabled = 0;

static pg_cid_t		pg_cmt_class_id;		/* PG class id */

/*
 * Forward declarations of the CMT class callbacks (registered via
 * pg_ops_cmt) and internal helpers.
 */
static pg_t		*pg_cmt_alloc();
static void		pg_cmt_free(pg_t *);
static void		pg_cmt_cpu_init(cpu_t *);
static void		pg_cmt_cpu_fini(cpu_t *);
static void		pg_cmt_cpu_active(cpu_t *);
static void		pg_cmt_cpu_inactive(cpu_t *);
static void		pg_cmt_cpupart_in(cpu_t *, cpupart_t *);
static void		pg_cmt_cpupart_move(cpu_t *, cpupart_t *, cpupart_t *);
static void		pg_cmt_hier_pack(void **, int);
static int		pg_cmt_cpu_belongs(pg_t *, cpu_t *);
static int		pg_cmt_hw(pghw_type_t);
static cmt_lgrp_t	*pg_cmt_find_lgrp(lgrp_handle_t);
static cmt_lgrp_t	*pg_cmt_lgrp_create(lgrp_handle_t);

/*
 * Macro to test if PG is managed by the CMT PG class
 */
#define	IS_CMT_PG(pg)	(((pg_t *)(pg))->pg_class->pgc_id == pg_cmt_class_id)
122fb2f18f8Sesaxe 
/*
 * CMT PG ops
 *
 * Callback vector registered with the PG framework for the CMT class.
 * The cpupart_out slot is intentionally NULL: no work is needed when a
 * CPU merely leaves a partition.
 */
struct pg_ops pg_ops_cmt = {
	pg_cmt_alloc,
	pg_cmt_free,
	pg_cmt_cpu_init,
	pg_cmt_cpu_fini,
	pg_cmt_cpu_active,
	pg_cmt_cpu_inactive,
	pg_cmt_cpupart_in,
	NULL,			/* cpupart_out */
	pg_cmt_cpupart_move,
	pg_cmt_cpu_belongs,
};
138fb2f18f8Sesaxe 
139fb2f18f8Sesaxe /*
140fb2f18f8Sesaxe  * Initialize the CMT PG class
141fb2f18f8Sesaxe  */
142fb2f18f8Sesaxe void
143fb2f18f8Sesaxe pg_cmt_class_init(void)
144fb2f18f8Sesaxe {
145fb2f18f8Sesaxe 	if (cmt_sched_disabled)
146fb2f18f8Sesaxe 		return;
147fb2f18f8Sesaxe 
148fb2f18f8Sesaxe 	pg_cmt_class_id = pg_class_register("cmt", &pg_ops_cmt, PGR_PHYSICAL);
149fb2f18f8Sesaxe }
150fb2f18f8Sesaxe 
/*
 * Called to indicate a new CPU has started up so
 * that either t0 or the slave startup thread can
 * be accounted for.
 */
void
pg_cmt_cpu_startup(cpu_t *cp)
{
	/* Account for one running thread throughout cp's CMT lineage */
	PG_NRUN_UPDATE(cp, 1);
}
161fb2f18f8Sesaxe 
162fb2f18f8Sesaxe /*
163fb2f18f8Sesaxe  * Adjust the CMT load in the CMT PGs in which the CPU belongs
164fb2f18f8Sesaxe  * Note that "n" can be positive in the case of increasing
165fb2f18f8Sesaxe  * load, or negative in the case of decreasing load.
166fb2f18f8Sesaxe  */
167fb2f18f8Sesaxe void
168fb2f18f8Sesaxe pg_cmt_load(cpu_t *cp, int n)
169fb2f18f8Sesaxe {
170fb2f18f8Sesaxe 	pg_cmt_t	*pg;
171fb2f18f8Sesaxe 
172fb2f18f8Sesaxe 	pg = (pg_cmt_t *)cp->cpu_pg->cmt_lineage;
173fb2f18f8Sesaxe 	while (pg != NULL) {
174fb2f18f8Sesaxe 		ASSERT(IS_CMT_PG(pg));
175fb2f18f8Sesaxe 		atomic_add_32(&pg->cmt_nrunning, n);
176fb2f18f8Sesaxe 		pg = pg->cmt_parent;
177fb2f18f8Sesaxe 	}
178fb2f18f8Sesaxe }
179fb2f18f8Sesaxe 
180fb2f18f8Sesaxe /*
181fb2f18f8Sesaxe  * Return non-zero if thread can migrate between "from" and "to"
182fb2f18f8Sesaxe  * without a performance penalty
183fb2f18f8Sesaxe  */
184fb2f18f8Sesaxe int
185fb2f18f8Sesaxe pg_cmt_can_migrate(cpu_t *from, cpu_t *to)
186fb2f18f8Sesaxe {
187fb2f18f8Sesaxe 	if (from->cpu_physid->cpu_cacheid ==
188fb2f18f8Sesaxe 	    to->cpu_physid->cpu_cacheid)
189fb2f18f8Sesaxe 		return (1);
190fb2f18f8Sesaxe 	return (0);
191fb2f18f8Sesaxe }
192fb2f18f8Sesaxe 
193fb2f18f8Sesaxe /*
194fb2f18f8Sesaxe  * CMT class specific PG allocation
195fb2f18f8Sesaxe  */
196fb2f18f8Sesaxe static pg_t *
197fb2f18f8Sesaxe pg_cmt_alloc(void)
198fb2f18f8Sesaxe {
199fb2f18f8Sesaxe 	return (kmem_zalloc(sizeof (pg_cmt_t), KM_NOSLEEP));
200fb2f18f8Sesaxe }
201fb2f18f8Sesaxe 
202fb2f18f8Sesaxe /*
203fb2f18f8Sesaxe  * Class specific PG de-allocation
204fb2f18f8Sesaxe  */
205fb2f18f8Sesaxe static void
206fb2f18f8Sesaxe pg_cmt_free(pg_t *pg)
207fb2f18f8Sesaxe {
208fb2f18f8Sesaxe 	ASSERT(pg != NULL);
209fb2f18f8Sesaxe 	ASSERT(IS_CMT_PG(pg));
210fb2f18f8Sesaxe 
211fb2f18f8Sesaxe 	kmem_free((pg_cmt_t *)pg, sizeof (pg_cmt_t));
212fb2f18f8Sesaxe }
213fb2f18f8Sesaxe 
214fb2f18f8Sesaxe /*
215fb2f18f8Sesaxe  * Return 1 if CMT scheduling policies should be impelmented
216fb2f18f8Sesaxe  * for the specified hardware sharing relationship.
217fb2f18f8Sesaxe  */
218fb2f18f8Sesaxe static int
219fb2f18f8Sesaxe pg_cmt_hw(pghw_type_t hw)
220fb2f18f8Sesaxe {
221d129bde2Sesaxe 	return (pg_plat_cmt_load_bal_hw(hw) ||
222d129bde2Sesaxe 	    pg_plat_cmt_affinity_hw(hw));
223fb2f18f8Sesaxe }
224fb2f18f8Sesaxe 
/*
 * CMT class callback for a new CPU entering the system
 *
 * Places the CPU into a CMT PG for each relevant hardware sharing
 * relationship, builds the CPU's load balancing lineage (cmt_pgs and
 * cmt_lineage), ties each lineage PG into the parent/child/sibling
 * hierarchy, and caches the CPU's physical IDs.
 */
static void
pg_cmt_cpu_init(cpu_t *cp)
{
	pg_cmt_t	*pg;
	group_t		*cmt_pgs;
	int		level, max_level, nlevels;
	pghw_type_t	hw;
	pg_t		*pg_cache = NULL;
	pg_cmt_t	*cpu_cmt_hier[PGHW_NUM_COMPONENTS];
	lgrp_handle_t	lgrp_handle;
	cmt_lgrp_t	*lgrp;

	ASSERT(MUTEX_HELD(&cpu_lock));

	/*
	 * A new CPU is coming into the system.
	 * Interrogate the platform to see if the CPU
	 * has any performance relevant CMT sharing
	 * relationships
	 */
	cmt_pgs = &cp->cpu_pg->cmt_pgs;
	cp->cpu_pg->cmt_lineage = NULL;

	bzero(cpu_cmt_hier, sizeof (cpu_cmt_hier));
	max_level = nlevels = 0;
	for (hw = PGHW_START; hw < PGHW_NUM_COMPONENTS; hw++) {

		/*
		 * We're only interested in CMT hw sharing relationships
		 */
		if (pg_cmt_hw(hw) == 0 || pg_plat_hw_shared(cp, hw) == 0)
			continue;

		/*
		 * Find (or create) the PG associated with
		 * the hw sharing relationship in which cp
		 * belongs.
		 *
		 * Determine if a suitable PG already
		 * exists, or if one needs to be created.
		 */
		pg = (pg_cmt_t *)pghw_place_cpu(cp, hw);
		if (pg == NULL) {
			/*
			 * Create a new one.
			 * Initialize the common...
			 */
			pg = (pg_cmt_t *)pg_create(pg_cmt_class_id);

			/* ... physical ... */
			pghw_init((pghw_t *)pg, cp, hw);

			/*
			 * ... and CMT specific portions of the
			 * structure.
			 */
			bitset_init(&pg->cmt_cpus_actv_set);
			group_create(&pg->cmt_cpus_actv);
		} else {
			ASSERT(IS_CMT_PG(pg));
		}

		/* Add the CPU to the PG */
		pg_cpu_add((pg_t *)pg, cp);

		/*
		 * Ensure capacity of the active CPU group/bitset
		 */
		group_expand(&pg->cmt_cpus_actv,
		    GROUP_SIZE(&((pg_t *)pg)->pg_cpus));

		/* The active set is indexed by cpu_seqid; grow it to fit */
		if (cp->cpu_seqid >=
		    bitset_capacity(&pg->cmt_cpus_actv_set)) {
			bitset_resize(&pg->cmt_cpus_actv_set,
			    cp->cpu_seqid + 1);
		}

		/*
		 * Build a lineage of CMT PGs for load balancing,
		 * indexed by the relationship's hierarchy level
		 */
		if (pg_plat_cmt_load_bal_hw(hw)) {
			level = pghw_level(hw);
			cpu_cmt_hier[level] = pg;
			if (level > max_level)
				max_level = level;
			nlevels++;
		}

		/* Cache this for later */
		if (hw == PGHW_CACHE)
			pg_cache = (pg_t *)pg;
	}

	/*
	 * Pack out any gaps in the constructed lineage,
	 * then size it out.
	 *
	 * Gaps may exist where the architecture knows
	 * about a hardware sharing relationship, but such a
	 * relationship either isn't relevant for load
	 * balancing or doesn't exist between CPUs on the system.
	 */
	pg_cmt_hier_pack((void **)cpu_cmt_hier, max_level + 1);
	group_expand(cmt_pgs, nlevels);


	/* Lazily create the root lgroup's cmt_lgrp_t on first use */
	if (cmt_root == NULL)
		cmt_root = pg_cmt_lgrp_create(lgrp_plat_root_hand());

	/*
	 * Find the lgrp that encapsulates this CPU's CMT hierarchy,
	 * and locate/create a suitable cmt_lgrp_t.
	 */
	lgrp_handle = lgrp_plat_cpu_to_hand(cp->cpu_id);
	if ((lgrp = pg_cmt_find_lgrp(lgrp_handle)) == NULL)
		lgrp = pg_cmt_lgrp_create(lgrp_handle);

	/*
	 * For each of the PGs in the CPU's lineage:
	 *	- Add an entry in the CPU's CMT PG group
	 *	  which is used by the dispatcher to implement load balancing
	 *	  policy.
	 *	- Tie the PG into the CMT hierarchy by connecting
	 *	  it to its parent and siblings.
	 */
	for (level = 0; level < nlevels; level++) {
		uint_t		children;
		int		err;

		/*
		 * Insert so that cmt_pgs holds the top level PG at index
		 * 0 and the leaf (level 0) PG at the last index.
		 */
		pg = cpu_cmt_hier[level];
		err = group_add_at(cmt_pgs, pg, nlevels - level - 1);
		ASSERT(err == 0);

		/* The leaf PG anchors the CPU's lineage */
		if (level == 0)
			cp->cpu_pg->cmt_lineage = (pg_t *)pg;

		if (pg->cmt_siblings != NULL) {
			/* Already initialized */
			ASSERT(pg->cmt_parent == NULL ||
			    pg->cmt_parent == cpu_cmt_hier[level + 1]);
			ASSERT(pg->cmt_siblings == &lgrp->cl_pgs ||
			    ((pg->cmt_parent != NULL) &&
			    pg->cmt_siblings == pg->cmt_parent->cmt_children));
			continue;
		}

		if ((level + 1) == nlevels) {
			/* Top level PG: its siblings are the lgroup's PGs */
			pg->cmt_parent = NULL;

			pg->cmt_siblings = &lgrp->cl_pgs;
			children = ++lgrp->cl_npgs;
			cmt_root->cl_npgs++;
		} else {
			pg->cmt_parent = cpu_cmt_hier[level + 1];

			/*
			 * A good parent keeps track of their children.
			 * The parent's children group is also the PG's
			 * siblings.
			 */
			if (pg->cmt_parent->cmt_children == NULL) {
				pg->cmt_parent->cmt_children =
				    kmem_zalloc(sizeof (group_t), KM_SLEEP);
				group_create(pg->cmt_parent->cmt_children);
			}
			pg->cmt_siblings = pg->cmt_parent->cmt_children;
			children = ++pg->cmt_parent->cmt_nchildren;
		}

		group_expand(pg->cmt_siblings, children);
		group_expand(&cmt_root->cl_pgs, cmt_root->cl_npgs);
	}

	/*
	 * Cache the chip and core IDs in the cpu_t->cpu_physid structure
	 * for fast lookups later.
	 */
	if (cp->cpu_physid) {
		cp->cpu_physid->cpu_chipid =
		    pg_plat_hw_instance_id(cp, PGHW_CHIP);
		cp->cpu_physid->cpu_coreid = pg_plat_get_core_id(cp);

		/*
		 * If this cpu has a PG representing shared cache, then set
		 * cpu_cacheid to that PG's logical id
		 */
		if (pg_cache)
			cp->cpu_physid->cpu_cacheid = pg_cache->pg_id;
	}

	/* CPU0 only initialization */
	if (is_cpu0) {
		pg_cmt_cpu_startup(cp);
		is_cpu0 = 0;
		cpu0_lgrp = lgrp;
	}

}
426fb2f18f8Sesaxe 
/*
 * Class callback when a CPU is leaving the system (deletion)
 *
 * Unwinds pg_cmt_cpu_init(): removes the CPU's PGs from its load
 * balancing lineage, then deletes the CPU from (and possibly destroys)
 * each of its CMT PGs.
 */
static void
pg_cmt_cpu_fini(cpu_t *cp)
{
	group_iter_t	i;
	pg_cmt_t	*pg;
	group_t		*pgs, *cmt_pgs;
	lgrp_handle_t	lgrp_handle;
	cmt_lgrp_t	*lgrp;

	pgs = &cp->cpu_pg->pgs;
	cmt_pgs = &cp->cpu_pg->cmt_pgs;

	/*
	 * Find the lgroup that encapsulates this CPU's CMT hierarchy
	 */
	lgrp_handle = lgrp_plat_cpu_to_hand(cp->cpu_id);

	lgrp = pg_cmt_find_lgrp(lgrp_handle);
	if (ncpus == 1 && lgrp != cpu0_lgrp) {
		/*
		 * One might wonder how we could be deconfiguring the
		 * only CPU in the system.
		 *
		 * On Starcat systems when null_proc_lpa is detected,
		 * the boot CPU (which is already configured into a leaf
		 * lgroup), is moved into the root lgroup. This is done by
		 * deconfiguring it from both lgroups and processor
		 * groups), and then later reconfiguring it back in.  This
		 * call to pg_cmt_cpu_fini() is part of that deconfiguration.
		 *
		 * This special case is detected by noting that the platform
		 * has changed the CPU's lgrp affiliation (since it now
		 * belongs in the root). In this case, use the cmt_lgrp_t
		 * cached for the boot CPU, since this is what needs to be
		 * torn down.
		 */
		lgrp = cpu0_lgrp;
	}

	ASSERT(lgrp != NULL);

	/*
	 * First, clean up anything load balancing specific for each of
	 * the CPU's PGs that participated in CMT load balancing
	 */
	pg = (pg_cmt_t *)cp->cpu_pg->cmt_lineage;
	while (pg != NULL) {

		/*
		 * Remove the PG from the CPU's load balancing lineage
		 */
		(void) group_remove(cmt_pgs, pg, GRP_RESIZE);

		/*
		 * If it's about to become empty, destroy its children
		 * group, and remove its reference from its siblings.
		 * This is done here (rather than below) to avoid removing
		 * our reference from a PG that we just eliminated.
		 */
		if (GROUP_SIZE(&((pg_t *)pg)->pg_cpus) == 1) {
			if (pg->cmt_children != NULL)
				group_destroy(pg->cmt_children);
			if (pg->cmt_siblings != NULL) {
				if (pg->cmt_siblings == &lgrp->cl_pgs)
					lgrp->cl_npgs--;
				else
					pg->cmt_parent->cmt_nchildren--;
			}
		}
		pg = pg->cmt_parent;
	}
	/* The lineage group must now be fully drained */
	ASSERT(GROUP_SIZE(cmt_pgs) == 0);

	/*
	 * Now that the load balancing lineage updates have happened,
	 * remove the CPU from all its PGs (destroying any that become
	 * empty).
	 */
	group_iter_init(&i);
	while ((pg = group_iterate(pgs, &i)) != NULL) {
		if (IS_CMT_PG(pg) == 0)
			continue;

		pg_cpu_delete((pg_t *)pg, cp);
		/*
		 * Deleting the CPU from the PG changes the CPU's
		 * PG group over which we are actively iterating
		 * Re-initialize the iteration
		 */
		group_iter_init(&i);

		if (GROUP_SIZE(&((pg_t *)pg)->pg_cpus) == 0) {

			/*
			 * The PG has become zero sized, so destroy it.
			 */
			group_destroy(&pg->cmt_cpus_actv);
			bitset_fini(&pg->cmt_cpus_actv_set);
			pghw_fini((pghw_t *)pg);

			pg_destroy((pg_t *)pg);
		}
	}
}
534fb2f18f8Sesaxe 
535fb2f18f8Sesaxe /*
536fb2f18f8Sesaxe  * Class callback when a CPU is entering a cpu partition
537fb2f18f8Sesaxe  */
538fb2f18f8Sesaxe static void
539fb2f18f8Sesaxe pg_cmt_cpupart_in(cpu_t *cp, cpupart_t *pp)
540fb2f18f8Sesaxe {
541fb2f18f8Sesaxe 	group_t		*pgs;
542fb2f18f8Sesaxe 	pg_t		*pg;
543fb2f18f8Sesaxe 	group_iter_t	i;
544fb2f18f8Sesaxe 
545fb2f18f8Sesaxe 	ASSERT(MUTEX_HELD(&cpu_lock));
546fb2f18f8Sesaxe 
547fb2f18f8Sesaxe 	pgs = &cp->cpu_pg->pgs;
548fb2f18f8Sesaxe 
549fb2f18f8Sesaxe 	/*
550fb2f18f8Sesaxe 	 * Ensure that the new partition's PG bitset
551fb2f18f8Sesaxe 	 * is large enough for all CMT PG's to which cp
552fb2f18f8Sesaxe 	 * belongs
553fb2f18f8Sesaxe 	 */
554fb2f18f8Sesaxe 	group_iter_init(&i);
555fb2f18f8Sesaxe 	while ((pg = group_iterate(pgs, &i)) != NULL) {
556fb2f18f8Sesaxe 		if (IS_CMT_PG(pg) == 0)
557fb2f18f8Sesaxe 			continue;
558fb2f18f8Sesaxe 
559fb2f18f8Sesaxe 		if (bitset_capacity(&pp->cp_cmt_pgs) <= pg->pg_id)
560fb2f18f8Sesaxe 			bitset_resize(&pp->cp_cmt_pgs, pg->pg_id + 1);
561fb2f18f8Sesaxe 	}
562fb2f18f8Sesaxe }
563fb2f18f8Sesaxe 
564fb2f18f8Sesaxe /*
565fb2f18f8Sesaxe  * Class callback when a CPU is actually moving partitions
566fb2f18f8Sesaxe  */
567fb2f18f8Sesaxe static void
568fb2f18f8Sesaxe pg_cmt_cpupart_move(cpu_t *cp, cpupart_t *oldpp, cpupart_t *newpp)
569fb2f18f8Sesaxe {
570fb2f18f8Sesaxe 	cpu_t		*cpp;
571fb2f18f8Sesaxe 	group_t		*pgs;
572fb2f18f8Sesaxe 	pg_t		*pg;
573fb2f18f8Sesaxe 	group_iter_t	pg_iter;
574fb2f18f8Sesaxe 	pg_cpu_itr_t	cpu_iter;
575fb2f18f8Sesaxe 	boolean_t	found;
576fb2f18f8Sesaxe 
577fb2f18f8Sesaxe 	ASSERT(MUTEX_HELD(&cpu_lock));
578fb2f18f8Sesaxe 
579fb2f18f8Sesaxe 	pgs = &cp->cpu_pg->pgs;
580fb2f18f8Sesaxe 	group_iter_init(&pg_iter);
581fb2f18f8Sesaxe 
582fb2f18f8Sesaxe 	/*
583fb2f18f8Sesaxe 	 * Iterate over the CPUs CMT PGs
584fb2f18f8Sesaxe 	 */
585fb2f18f8Sesaxe 	while ((pg = group_iterate(pgs, &pg_iter)) != NULL) {
586fb2f18f8Sesaxe 
587fb2f18f8Sesaxe 		if (IS_CMT_PG(pg) == 0)
588fb2f18f8Sesaxe 			continue;
589fb2f18f8Sesaxe 
590fb2f18f8Sesaxe 		/*
591fb2f18f8Sesaxe 		 * Add the PG to the bitset in the new partition.
592fb2f18f8Sesaxe 		 */
593fb2f18f8Sesaxe 		bitset_add(&newpp->cp_cmt_pgs, pg->pg_id);
594fb2f18f8Sesaxe 
595fb2f18f8Sesaxe 		/*
596fb2f18f8Sesaxe 		 * Remove the PG from the bitset in the old partition
597fb2f18f8Sesaxe 		 * if the last of the PG's CPUs have left.
598fb2f18f8Sesaxe 		 */
599fb2f18f8Sesaxe 		found = B_FALSE;
600fb2f18f8Sesaxe 		PG_CPU_ITR_INIT(pg, cpu_iter);
601fb2f18f8Sesaxe 		while ((cpp = pg_cpu_next(&cpu_iter)) != NULL) {
602fb2f18f8Sesaxe 			if (cpp == cp)
603fb2f18f8Sesaxe 				continue;
604a6604450Sesaxe 			if (CPU_ACTIVE(cpp) &&
605a6604450Sesaxe 			    cpp->cpu_part->cp_id == oldpp->cp_id) {
606fb2f18f8Sesaxe 				found = B_TRUE;
607fb2f18f8Sesaxe 				break;
608fb2f18f8Sesaxe 			}
609fb2f18f8Sesaxe 		}
610fb2f18f8Sesaxe 		if (!found)
611fb2f18f8Sesaxe 			bitset_del(&cp->cpu_part->cp_cmt_pgs, pg->pg_id);
612fb2f18f8Sesaxe 	}
613fb2f18f8Sesaxe }
614fb2f18f8Sesaxe 
615fb2f18f8Sesaxe /*
616fb2f18f8Sesaxe  * Class callback when a CPU becomes active (online)
617fb2f18f8Sesaxe  *
618fb2f18f8Sesaxe  * This is called in a context where CPUs are paused
619fb2f18f8Sesaxe  */
620fb2f18f8Sesaxe static void
621fb2f18f8Sesaxe pg_cmt_cpu_active(cpu_t *cp)
622fb2f18f8Sesaxe {
623fb2f18f8Sesaxe 	int		err;
624fb2f18f8Sesaxe 	group_iter_t	i;
625fb2f18f8Sesaxe 	pg_cmt_t	*pg;
626fb2f18f8Sesaxe 	group_t		*pgs;
627fb2f18f8Sesaxe 
628fb2f18f8Sesaxe 	ASSERT(MUTEX_HELD(&cpu_lock));
629fb2f18f8Sesaxe 
630fb2f18f8Sesaxe 	pgs = &cp->cpu_pg->pgs;
631fb2f18f8Sesaxe 	group_iter_init(&i);
632fb2f18f8Sesaxe 
633fb2f18f8Sesaxe 	/*
634fb2f18f8Sesaxe 	 * Iterate over the CPU's PGs
635fb2f18f8Sesaxe 	 */
636fb2f18f8Sesaxe 	while ((pg = group_iterate(pgs, &i)) != NULL) {
637fb2f18f8Sesaxe 
638fb2f18f8Sesaxe 		if (IS_CMT_PG(pg) == 0)
639fb2f18f8Sesaxe 			continue;
640fb2f18f8Sesaxe 
641fb2f18f8Sesaxe 		err = group_add(&pg->cmt_cpus_actv, cp, GRP_NORESIZE);
642fb2f18f8Sesaxe 		ASSERT(err == 0);
643fb2f18f8Sesaxe 
644fb2f18f8Sesaxe 		/*
645fb2f18f8Sesaxe 		 * If this is the first active CPU in the PG, and it
646fb2f18f8Sesaxe 		 * represents a hardware sharing relationship over which
647fb2f18f8Sesaxe 		 * CMT load balancing is performed, add it as a candidate
648fb2f18f8Sesaxe 		 * for balancing with it's siblings.
649fb2f18f8Sesaxe 		 */
650fb2f18f8Sesaxe 		if (GROUP_SIZE(&pg->cmt_cpus_actv) == 1 &&
651d129bde2Sesaxe 		    pg_plat_cmt_load_bal_hw(((pghw_t *)pg)->pghw_hw)) {
652fb2f18f8Sesaxe 			err = group_add(pg->cmt_siblings, pg, GRP_NORESIZE);
653fb2f18f8Sesaxe 			ASSERT(err == 0);
6546890d023SEric Saxe 
6556890d023SEric Saxe 			/*
6566890d023SEric Saxe 			 * If this is a top level PG, add it as a balancing
6576890d023SEric Saxe 			 * candidate when balancing within the root lgroup
6586890d023SEric Saxe 			 */
6596890d023SEric Saxe 			if (pg->cmt_parent == NULL) {
6606890d023SEric Saxe 				err = group_add(&cmt_root->cl_pgs, pg,
6616890d023SEric Saxe 				    GRP_NORESIZE);
6626890d023SEric Saxe 				ASSERT(err == 0);
6636890d023SEric Saxe 			}
664fb2f18f8Sesaxe 		}
665fb2f18f8Sesaxe 
666fb2f18f8Sesaxe 		/*
667fb2f18f8Sesaxe 		 * Notate the CPU in the PGs active CPU bitset.
668fb2f18f8Sesaxe 		 * Also notate the PG as being active in it's associated
669fb2f18f8Sesaxe 		 * partition
670fb2f18f8Sesaxe 		 */
671fb2f18f8Sesaxe 		bitset_add(&pg->cmt_cpus_actv_set, cp->cpu_seqid);
672fb2f18f8Sesaxe 		bitset_add(&cp->cpu_part->cp_cmt_pgs, ((pg_t *)pg)->pg_id);
673fb2f18f8Sesaxe 	}
674fb2f18f8Sesaxe }
675fb2f18f8Sesaxe 
676fb2f18f8Sesaxe /*
677fb2f18f8Sesaxe  * Class callback when a CPU goes inactive (offline)
678fb2f18f8Sesaxe  *
679fb2f18f8Sesaxe  * This is called in a context where CPUs are paused
680fb2f18f8Sesaxe  */
681fb2f18f8Sesaxe static void
682fb2f18f8Sesaxe pg_cmt_cpu_inactive(cpu_t *cp)
683fb2f18f8Sesaxe {
684fb2f18f8Sesaxe 	int		err;
685fb2f18f8Sesaxe 	group_t		*pgs;
686fb2f18f8Sesaxe 	pg_cmt_t	*pg;
687fb2f18f8Sesaxe 	cpu_t		*cpp;
688fb2f18f8Sesaxe 	group_iter_t	i;
689fb2f18f8Sesaxe 	pg_cpu_itr_t	cpu_itr;
690fb2f18f8Sesaxe 	boolean_t	found;
691fb2f18f8Sesaxe 
692fb2f18f8Sesaxe 	ASSERT(MUTEX_HELD(&cpu_lock));
693fb2f18f8Sesaxe 
694fb2f18f8Sesaxe 	pgs = &cp->cpu_pg->pgs;
695fb2f18f8Sesaxe 	group_iter_init(&i);
696fb2f18f8Sesaxe 
697fb2f18f8Sesaxe 	while ((pg = group_iterate(pgs, &i)) != NULL) {
698fb2f18f8Sesaxe 
699fb2f18f8Sesaxe 		if (IS_CMT_PG(pg) == 0)
700fb2f18f8Sesaxe 			continue;
701fb2f18f8Sesaxe 
702fb2f18f8Sesaxe 		/*
703fb2f18f8Sesaxe 		 * Remove the CPU from the CMT PGs active CPU group
704fb2f18f8Sesaxe 		 * bitmap
705fb2f18f8Sesaxe 		 */
706fb2f18f8Sesaxe 		err = group_remove(&pg->cmt_cpus_actv, cp, GRP_NORESIZE);
707fb2f18f8Sesaxe 		ASSERT(err == 0);
708fb2f18f8Sesaxe 
709fb2f18f8Sesaxe 		bitset_del(&pg->cmt_cpus_actv_set, cp->cpu_seqid);
710fb2f18f8Sesaxe 
711fb2f18f8Sesaxe 		/*
712fb2f18f8Sesaxe 		 * If there are no more active CPUs in this PG over which
713fb2f18f8Sesaxe 		 * load was balanced, remove it as a balancing candidate.
714fb2f18f8Sesaxe 		 */
715fb2f18f8Sesaxe 		if (GROUP_SIZE(&pg->cmt_cpus_actv) == 0 &&
716d129bde2Sesaxe 		    pg_plat_cmt_load_bal_hw(((pghw_t *)pg)->pghw_hw)) {
717fb2f18f8Sesaxe 			err = group_remove(pg->cmt_siblings, pg, GRP_NORESIZE);
718fb2f18f8Sesaxe 			ASSERT(err == 0);
7196890d023SEric Saxe 
7206890d023SEric Saxe 			if (pg->cmt_parent == NULL) {
7216890d023SEric Saxe 				err = group_remove(&cmt_root->cl_pgs, pg,
7226890d023SEric Saxe 				    GRP_NORESIZE);
7236890d023SEric Saxe 				ASSERT(err == 0);
7246890d023SEric Saxe 			}
725fb2f18f8Sesaxe 		}
726fb2f18f8Sesaxe 
727fb2f18f8Sesaxe 		/*
728fb2f18f8Sesaxe 		 * Assert the number of active CPUs does not exceed
729fb2f18f8Sesaxe 		 * the total number of CPUs in the PG
730fb2f18f8Sesaxe 		 */
731fb2f18f8Sesaxe 		ASSERT(GROUP_SIZE(&pg->cmt_cpus_actv) <=
732fb2f18f8Sesaxe 		    GROUP_SIZE(&((pg_t *)pg)->pg_cpus));
733fb2f18f8Sesaxe 
734fb2f18f8Sesaxe 		/*
735fb2f18f8Sesaxe 		 * Update the PG bitset in the CPU's old partition
736fb2f18f8Sesaxe 		 */
737fb2f18f8Sesaxe 		found = B_FALSE;
738fb2f18f8Sesaxe 		PG_CPU_ITR_INIT(pg, cpu_itr);
739fb2f18f8Sesaxe 		while ((cpp = pg_cpu_next(&cpu_itr)) != NULL) {
740fb2f18f8Sesaxe 			if (cpp == cp)
741fb2f18f8Sesaxe 				continue;
742a6604450Sesaxe 			if (CPU_ACTIVE(cpp) &&
743a6604450Sesaxe 			    cpp->cpu_part->cp_id == cp->cpu_part->cp_id) {
744fb2f18f8Sesaxe 				found = B_TRUE;
745fb2f18f8Sesaxe 				break;
746fb2f18f8Sesaxe 			}
747fb2f18f8Sesaxe 		}
748fb2f18f8Sesaxe 		if (!found) {
749fb2f18f8Sesaxe 			bitset_del(&cp->cpu_part->cp_cmt_pgs,
750fb2f18f8Sesaxe 			    ((pg_t *)pg)->pg_id);
751fb2f18f8Sesaxe 		}
752fb2f18f8Sesaxe 	}
753fb2f18f8Sesaxe }
754fb2f18f8Sesaxe 
755fb2f18f8Sesaxe /*
756fb2f18f8Sesaxe  * Return non-zero if the CPU belongs in the given PG
757fb2f18f8Sesaxe  */
758fb2f18f8Sesaxe static int
759fb2f18f8Sesaxe pg_cmt_cpu_belongs(pg_t *pg, cpu_t *cp)
760fb2f18f8Sesaxe {
761fb2f18f8Sesaxe 	cpu_t	*pg_cpu;
762fb2f18f8Sesaxe 
763fb2f18f8Sesaxe 	pg_cpu = GROUP_ACCESS(&pg->pg_cpus, 0);
764fb2f18f8Sesaxe 
765fb2f18f8Sesaxe 	ASSERT(pg_cpu != NULL);
766fb2f18f8Sesaxe 
767fb2f18f8Sesaxe 	/*
768fb2f18f8Sesaxe 	 * The CPU belongs if, given the nature of the hardware sharing
769fb2f18f8Sesaxe 	 * relationship represented by the PG, the CPU has that
770fb2f18f8Sesaxe 	 * relationship with some other CPU already in the PG
771fb2f18f8Sesaxe 	 */
772fb2f18f8Sesaxe 	if (pg_plat_cpus_share(cp, pg_cpu, ((pghw_t *)pg)->pghw_hw))
773fb2f18f8Sesaxe 		return (1);
774fb2f18f8Sesaxe 
775fb2f18f8Sesaxe 	return (0);
776fb2f18f8Sesaxe }
777fb2f18f8Sesaxe 
778fb2f18f8Sesaxe /*
7796890d023SEric Saxe  * Hierarchy packing utility routine. The hierarchy order is preserved.
780fb2f18f8Sesaxe  */
781fb2f18f8Sesaxe static void
7826890d023SEric Saxe pg_cmt_hier_pack(void *hier[], int sz)
783fb2f18f8Sesaxe {
784fb2f18f8Sesaxe 	int	i, j;
785fb2f18f8Sesaxe 
786fb2f18f8Sesaxe 	for (i = 0; i < sz; i++) {
787fb2f18f8Sesaxe 		if (hier[i] != NULL)
788fb2f18f8Sesaxe 			continue;
789fb2f18f8Sesaxe 
790fb2f18f8Sesaxe 		for (j = i; j < sz; j++) {
791fb2f18f8Sesaxe 			if (hier[j] != NULL) {
792fb2f18f8Sesaxe 				hier[i] = hier[j];
793fb2f18f8Sesaxe 				hier[j] = NULL;
794fb2f18f8Sesaxe 				break;
795fb2f18f8Sesaxe 			}
796fb2f18f8Sesaxe 		}
797fb2f18f8Sesaxe 		if (j == sz)
798fb2f18f8Sesaxe 			break;
799fb2f18f8Sesaxe 	}
800fb2f18f8Sesaxe }
801fb2f18f8Sesaxe 
802fb2f18f8Sesaxe /*
803fb2f18f8Sesaxe  * Return a cmt_lgrp_t * given an lgroup handle.
804fb2f18f8Sesaxe  */
805fb2f18f8Sesaxe static cmt_lgrp_t *
806fb2f18f8Sesaxe pg_cmt_find_lgrp(lgrp_handle_t hand)
807fb2f18f8Sesaxe {
808fb2f18f8Sesaxe 	cmt_lgrp_t	*lgrp;
809fb2f18f8Sesaxe 
810fb2f18f8Sesaxe 	ASSERT(MUTEX_HELD(&cpu_lock));
811fb2f18f8Sesaxe 
812fb2f18f8Sesaxe 	lgrp = cmt_lgrps;
813fb2f18f8Sesaxe 	while (lgrp != NULL) {
814fb2f18f8Sesaxe 		if (lgrp->cl_hand == hand)
815a6604450Sesaxe 			break;
816fb2f18f8Sesaxe 		lgrp = lgrp->cl_next;
817fb2f18f8Sesaxe 	}
818a6604450Sesaxe 	return (lgrp);
819a6604450Sesaxe }
820fb2f18f8Sesaxe 
821fb2f18f8Sesaxe /*
822a6604450Sesaxe  * Create a cmt_lgrp_t with the specified handle.
823fb2f18f8Sesaxe  */
824a6604450Sesaxe static cmt_lgrp_t *
825a6604450Sesaxe pg_cmt_lgrp_create(lgrp_handle_t hand)
826a6604450Sesaxe {
827a6604450Sesaxe 	cmt_lgrp_t	*lgrp;
828a6604450Sesaxe 
829a6604450Sesaxe 	ASSERT(MUTEX_HELD(&cpu_lock));
830a6604450Sesaxe 
831fb2f18f8Sesaxe 	lgrp = kmem_zalloc(sizeof (cmt_lgrp_t), KM_SLEEP);
832fb2f18f8Sesaxe 
833fb2f18f8Sesaxe 	lgrp->cl_hand = hand;
834fb2f18f8Sesaxe 	lgrp->cl_npgs = 0;
835fb2f18f8Sesaxe 	lgrp->cl_next = cmt_lgrps;
836fb2f18f8Sesaxe 	cmt_lgrps = lgrp;
837fb2f18f8Sesaxe 	group_create(&lgrp->cl_pgs);
838fb2f18f8Sesaxe 
839fb2f18f8Sesaxe 	return (lgrp);
840fb2f18f8Sesaxe }
8416890d023SEric Saxe 
8426890d023SEric Saxe /*
8436890d023SEric Saxe  * Perform multi-level CMT load balancing of running threads.
8446890d023SEric Saxe  *
8456890d023SEric Saxe  * tp is the thread being enqueued.
8466890d023SEric Saxe  * cp is a hint CPU, against which CMT load balancing will be performed.
8476890d023SEric Saxe  *
8486890d023SEric Saxe  * Returns cp, or a CPU better than cp with respect to balancing
8496890d023SEric Saxe  * running thread load.
8506890d023SEric Saxe  */
8516890d023SEric Saxe cpu_t *
8526890d023SEric Saxe cmt_balance(kthread_t *tp, cpu_t *cp)
8536890d023SEric Saxe {
8546890d023SEric Saxe 	int		hint, i, cpu, nsiblings;
8556890d023SEric Saxe 	int		self = 0;
8566890d023SEric Saxe 	group_t		*cmt_pgs, *siblings;
8576890d023SEric Saxe 	pg_cmt_t	*pg, *pg_tmp, *tpg = NULL;
8586890d023SEric Saxe 	int		pg_nrun, tpg_nrun;
8596890d023SEric Saxe 	int		level = 0;
8606890d023SEric Saxe 	cpu_t		*newcp;
8616890d023SEric Saxe 
8626890d023SEric Saxe 	ASSERT(THREAD_LOCK_HELD(tp));
8636890d023SEric Saxe 
8646890d023SEric Saxe 	cmt_pgs = &cp->cpu_pg->cmt_pgs;
8656890d023SEric Saxe 
8666890d023SEric Saxe 	if (GROUP_SIZE(cmt_pgs) == 0)
8676890d023SEric Saxe 		return (cp);	/* nothing to do */
8686890d023SEric Saxe 
8696890d023SEric Saxe 	if (tp == curthread)
8706890d023SEric Saxe 		self = 1;
8716890d023SEric Saxe 
8726890d023SEric Saxe 	/*
8736890d023SEric Saxe 	 * Balance across siblings in the CPUs CMT lineage
8746890d023SEric Saxe 	 * If the thread is homed to the root lgroup, perform
8756890d023SEric Saxe 	 * top level balancing against other top level PGs
8766890d023SEric Saxe 	 * in the system. Otherwise, start with the default
8776890d023SEric Saxe 	 * top level siblings group, which is within the leaf lgroup
8786890d023SEric Saxe 	 */
8796890d023SEric Saxe 	pg = GROUP_ACCESS(cmt_pgs, level);
8806890d023SEric Saxe 	if (tp->t_lpl->lpl_lgrpid == LGRP_ROOTID)
8816890d023SEric Saxe 		siblings = &cmt_root->cl_pgs;
8826890d023SEric Saxe 	else
8836890d023SEric Saxe 		siblings = pg->cmt_siblings;
8846890d023SEric Saxe 
8856890d023SEric Saxe 	/*
8866890d023SEric Saxe 	 * Traverse down the lineage until we find a level that needs
8876890d023SEric Saxe 	 * balancing, or we get to the end.
8886890d023SEric Saxe 	 */
8896890d023SEric Saxe 	for (;;) {
8906890d023SEric Saxe 		nsiblings = GROUP_SIZE(siblings);	/* self inclusive */
8916890d023SEric Saxe 		if (nsiblings == 1)
8926890d023SEric Saxe 			goto next_level;
8936890d023SEric Saxe 
8946890d023SEric Saxe 		pg_nrun = pg->cmt_nrunning;
8956890d023SEric Saxe 		if (self &&
8966890d023SEric Saxe 		    bitset_in_set(&pg->cmt_cpus_actv_set, CPU->cpu_seqid))
8976890d023SEric Saxe 			pg_nrun--;	/* Ignore curthread's effect */
8986890d023SEric Saxe 
8996890d023SEric Saxe 		hint = CPU_PSEUDO_RANDOM() % nsiblings;
9006890d023SEric Saxe 
9016890d023SEric Saxe 		/*
9026890d023SEric Saxe 		 * Find a balancing candidate from among our siblings
9036890d023SEric Saxe 		 * "hint" is a hint for where to start looking
9046890d023SEric Saxe 		 */
9056890d023SEric Saxe 		i = hint;
9066890d023SEric Saxe 		do {
9076890d023SEric Saxe 			ASSERT(i < nsiblings);
9086890d023SEric Saxe 			pg_tmp = GROUP_ACCESS(siblings, i);
9096890d023SEric Saxe 
9106890d023SEric Saxe 			/*
9116890d023SEric Saxe 			 * The candidate must not be us, and must
9126890d023SEric Saxe 			 * have some CPU resources in the thread's
9136890d023SEric Saxe 			 * partition
9146890d023SEric Saxe 			 */
9156890d023SEric Saxe 			if (pg_tmp != pg &&
9166890d023SEric Saxe 			    bitset_in_set(&tp->t_cpupart->cp_cmt_pgs,
9176890d023SEric Saxe 			    ((pg_t *)pg_tmp)->pg_id)) {
9186890d023SEric Saxe 				tpg = pg_tmp;
9196890d023SEric Saxe 				break;
9206890d023SEric Saxe 			}
9216890d023SEric Saxe 
9226890d023SEric Saxe 			if (++i >= nsiblings)
9236890d023SEric Saxe 				i = 0;
9246890d023SEric Saxe 		} while (i != hint);
9256890d023SEric Saxe 
9266890d023SEric Saxe 		if (!tpg)
9276890d023SEric Saxe 			goto next_level; /* no candidates at this level */
9286890d023SEric Saxe 
9296890d023SEric Saxe 		/*
9306890d023SEric Saxe 		 * Check if the balancing target is underloaded
9316890d023SEric Saxe 		 * Decide to balance if the target is running fewer
9326890d023SEric Saxe 		 * threads, or if it's running the same number of threads
9336890d023SEric Saxe 		 * with more online CPUs
9346890d023SEric Saxe 		 */
9356890d023SEric Saxe 		tpg_nrun = tpg->cmt_nrunning;
9366890d023SEric Saxe 		if (pg_nrun > tpg_nrun ||
9376890d023SEric Saxe 		    (pg_nrun == tpg_nrun &&
9386890d023SEric Saxe 		    (GROUP_SIZE(&tpg->cmt_cpus_actv) >
9396890d023SEric Saxe 		    GROUP_SIZE(&pg->cmt_cpus_actv)))) {
9406890d023SEric Saxe 			break;
9416890d023SEric Saxe 		}
9426890d023SEric Saxe 		tpg = NULL;
9436890d023SEric Saxe 
9446890d023SEric Saxe next_level:
9456890d023SEric Saxe 		if (++level == GROUP_SIZE(cmt_pgs))
9466890d023SEric Saxe 			break;
9476890d023SEric Saxe 
9486890d023SEric Saxe 		pg = GROUP_ACCESS(cmt_pgs, level);
9496890d023SEric Saxe 		siblings = pg->cmt_siblings;
9506890d023SEric Saxe 	}
9516890d023SEric Saxe 
9526890d023SEric Saxe 	if (tpg) {
9536890d023SEric Saxe 		uint_t	tgt_size = GROUP_SIZE(&tpg->cmt_cpus_actv);
9546890d023SEric Saxe 
9556890d023SEric Saxe 		/*
9566890d023SEric Saxe 		 * Select an idle CPU from the target
9576890d023SEric Saxe 		 */
9586890d023SEric Saxe 		hint = CPU_PSEUDO_RANDOM() % tgt_size;
9596890d023SEric Saxe 		cpu = hint;
9606890d023SEric Saxe 		do {
9616890d023SEric Saxe 			newcp = GROUP_ACCESS(&tpg->cmt_cpus_actv, cpu);
9626890d023SEric Saxe 			if (newcp->cpu_part == tp->t_cpupart &&
9636890d023SEric Saxe 			    newcp->cpu_dispatch_pri == -1) {
9646890d023SEric Saxe 				cp = newcp;
9656890d023SEric Saxe 				break;
9666890d023SEric Saxe 			}
9676890d023SEric Saxe 			if (++cpu == tgt_size)
9686890d023SEric Saxe 				cpu = 0;
9696890d023SEric Saxe 		} while (cpu != hint);
9706890d023SEric Saxe 	}
9716890d023SEric Saxe 
9726890d023SEric Saxe 	return (cp);
9736890d023SEric Saxe }
974