xref: /titanic_51/usr/src/uts/common/disp/cmt.c (revision 0e7515250c8395f368aa45fb9acae7c4f8f8b786)
1fb2f18f8Sesaxe /*
2fb2f18f8Sesaxe  * CDDL HEADER START
3fb2f18f8Sesaxe  *
4fb2f18f8Sesaxe  * The contents of this file are subject to the terms of the
5fb2f18f8Sesaxe  * Common Development and Distribution License (the "License").
6fb2f18f8Sesaxe  * You may not use this file except in compliance with the License.
7fb2f18f8Sesaxe  *
8fb2f18f8Sesaxe  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9fb2f18f8Sesaxe  * or http://www.opensolaris.org/os/licensing.
10fb2f18f8Sesaxe  * See the License for the specific language governing permissions
11fb2f18f8Sesaxe  * and limitations under the License.
12fb2f18f8Sesaxe  *
13fb2f18f8Sesaxe  * When distributing Covered Code, include this CDDL HEADER in each
14fb2f18f8Sesaxe  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15fb2f18f8Sesaxe  * If applicable, add the following below this CDDL HEADER, with the
16fb2f18f8Sesaxe  * fields enclosed by brackets "[]" replaced with your own identifying
17fb2f18f8Sesaxe  * information: Portions Copyright [yyyy] [name of copyright owner]
18fb2f18f8Sesaxe  *
19fb2f18f8Sesaxe  * CDDL HEADER END
20fb2f18f8Sesaxe  */
21fb2f18f8Sesaxe /*
223e81cacfSEric Saxe  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23fb2f18f8Sesaxe  * Use is subject to license terms.
24fb2f18f8Sesaxe  */
25fb2f18f8Sesaxe 
26fb2f18f8Sesaxe #include <sys/systm.h>
27fb2f18f8Sesaxe #include <sys/types.h>
28fb2f18f8Sesaxe #include <sys/param.h>
29fb2f18f8Sesaxe #include <sys/thread.h>
30fb2f18f8Sesaxe #include <sys/cpuvar.h>
31fb2f18f8Sesaxe #include <sys/cpupart.h>
32fb2f18f8Sesaxe #include <sys/kmem.h>
33fb2f18f8Sesaxe #include <sys/cmn_err.h>
34fb2f18f8Sesaxe #include <sys/kstat.h>
35fb2f18f8Sesaxe #include <sys/processor.h>
36fb2f18f8Sesaxe #include <sys/disp.h>
37fb2f18f8Sesaxe #include <sys/group.h>
38fb2f18f8Sesaxe #include <sys/pghw.h>
39fb2f18f8Sesaxe #include <sys/bitset.h>
40fb2f18f8Sesaxe #include <sys/lgrp.h>
41fb2f18f8Sesaxe #include <sys/cmt.h>
42*0e751525SEric Saxe #include <sys/cpu_pm.h>
43fb2f18f8Sesaxe 
44fb2f18f8Sesaxe /*
45fb2f18f8Sesaxe  * CMT scheduler / dispatcher support
46fb2f18f8Sesaxe  *
47fb2f18f8Sesaxe  * This file implements CMT scheduler support using Processor Groups.
48fb2f18f8Sesaxe  * The CMT processor group class creates and maintains the CMT class
49fb2f18f8Sesaxe  * specific processor group pg_cmt_t.
50fb2f18f8Sesaxe  *
51fb2f18f8Sesaxe  * ---------------------------- <-- pg_cmt_t *
52fb2f18f8Sesaxe  * | pghw_t                   |
53fb2f18f8Sesaxe  * ----------------------------
54fb2f18f8Sesaxe  * | CMT class specific data  |
55fb2f18f8Sesaxe  * | - hierarchy linkage      |
56fb2f18f8Sesaxe  * | - CMT load balancing data|
57fb2f18f8Sesaxe  * | - active CPU group/bitset|
58fb2f18f8Sesaxe  * ----------------------------
59fb2f18f8Sesaxe  *
60fb2f18f8Sesaxe  * The scheduler/dispatcher leverages knowledge of the performance
61fb2f18f8Sesaxe  * relevant CMT sharing relationships existing between cpus to implement
62*0e751525SEric Saxe  * optimized affinity, load balancing, and coalescence policies.
63fb2f18f8Sesaxe  *
64fb2f18f8Sesaxe  * Load balancing policy seeks to improve performance by minimizing
65*0e751525SEric Saxe  * contention over shared processor resources / facilities, Affinity
66*0e751525SEric Saxe  * policies seek to improve cache and TLB utilization. Coalescence
67*0e751525SEric Saxe  * policies improve resource utilization and ultimately power efficiency.
68fb2f18f8Sesaxe  *
69fb2f18f8Sesaxe  * The CMT PGs created by this class are already arranged into a
70fb2f18f8Sesaxe  * hierarchy (which is done in the pghw layer). To implement the top-down
71fb2f18f8Sesaxe  * CMT load balancing algorithm, the CMT PGs additionally maintain
72fb2f18f8Sesaxe  * parent, child and sibling hierarchy relationships.
73fb2f18f8Sesaxe  * Parent PGs always contain a superset of their children(s) resources,
74fb2f18f8Sesaxe  * each PG can have at most one parent, and siblings are the group of PGs
75fb2f18f8Sesaxe  * sharing the same parent.
76fb2f18f8Sesaxe  *
77fb2f18f8Sesaxe  * On NUMA systems, the CMT load balancing algorithm balances across the
78fb2f18f8Sesaxe  * CMT PGs within their respective lgroups. On UMA based system, there
79fb2f18f8Sesaxe  * exists a top level group of PGs to balance across. On NUMA systems multiple
80fb2f18f8Sesaxe  * top level groups are instantiated, where the top level balancing begins by
81fb2f18f8Sesaxe  * balancing across the CMT PGs within their respective (per lgroup) top level
82fb2f18f8Sesaxe  * groups.
83fb2f18f8Sesaxe  */
84a6604450Sesaxe static cmt_lgrp_t	*cmt_lgrps = NULL;	/* cmt_lgrps list head */
85a6604450Sesaxe static cmt_lgrp_t	*cpu0_lgrp = NULL;	/* boot CPU's initial lgrp */
86a6604450Sesaxe 						/* used for null_proc_lpa */
87*0e751525SEric Saxe cmt_lgrp_t		*cmt_root = NULL;	/* Reference to root cmt pg */
88fb2f18f8Sesaxe 
89a6604450Sesaxe static int		is_cpu0 = 1; /* true if this is boot CPU context */
90a6604450Sesaxe 
91a6604450Sesaxe /*
92*0e751525SEric Saxe  * Array of hardware sharing relationships that are blacklisted.
93*0e751525SEric Saxe  * PGs won't be instantiated for blacklisted hardware sharing relationships.
94*0e751525SEric Saxe  */
95*0e751525SEric Saxe static int		cmt_hw_blacklisted[PGHW_NUM_COMPONENTS];
96*0e751525SEric Saxe 
97*0e751525SEric Saxe /*
98a6604450Sesaxe  * Set this to non-zero to disable CMT scheduling
99a6604450Sesaxe  * This must be done via kmdb -d, as /etc/system will be too late
100a6604450Sesaxe  */
101*0e751525SEric Saxe int			cmt_sched_disabled = 0;
102fb2f18f8Sesaxe 
103fb2f18f8Sesaxe static pg_cid_t		pg_cmt_class_id;		/* PG class id */
104fb2f18f8Sesaxe 
105fb2f18f8Sesaxe static pg_t		*pg_cmt_alloc();
106fb2f18f8Sesaxe static void		pg_cmt_free(pg_t *);
107fb2f18f8Sesaxe static void		pg_cmt_cpu_init(cpu_t *);
108fb2f18f8Sesaxe static void		pg_cmt_cpu_fini(cpu_t *);
109fb2f18f8Sesaxe static void		pg_cmt_cpu_active(cpu_t *);
110fb2f18f8Sesaxe static void		pg_cmt_cpu_inactive(cpu_t *);
111fb2f18f8Sesaxe static void		pg_cmt_cpupart_in(cpu_t *, cpupart_t *);
112fb2f18f8Sesaxe static void		pg_cmt_cpupart_move(cpu_t *, cpupart_t *, cpupart_t *);
113*0e751525SEric Saxe static char		*pg_cmt_policy_name(pg_t *);
114*0e751525SEric Saxe static void		pg_cmt_hier_sort(pg_cmt_t **, int);
115*0e751525SEric Saxe static pg_cmt_t		*pg_cmt_hier_rank(pg_cmt_t *, pg_cmt_t *);
116fb2f18f8Sesaxe static int		pg_cmt_cpu_belongs(pg_t *, cpu_t *);
117fb2f18f8Sesaxe static int		pg_cmt_hw(pghw_type_t);
118fb2f18f8Sesaxe static cmt_lgrp_t	*pg_cmt_find_lgrp(lgrp_handle_t);
119a6604450Sesaxe static cmt_lgrp_t	*pg_cmt_lgrp_create(lgrp_handle_t);
120*0e751525SEric Saxe static int		pg_cmt_lineage_validate(pg_cmt_t **, int *);
121*0e751525SEric Saxe static void		cmt_ev_thread_swtch(pg_t *, cpu_t *, hrtime_t,
122*0e751525SEric Saxe 			    kthread_t *, kthread_t *);
123*0e751525SEric Saxe static void		cmt_ev_thread_swtch_pwr(pg_t *, cpu_t *, hrtime_t,
124*0e751525SEric Saxe 			    kthread_t *, kthread_t *);
125*0e751525SEric Saxe static void		cmt_ev_thread_remain_pwr(pg_t *, cpu_t *, kthread_t *);
126fb2f18f8Sesaxe 
127fb2f18f8Sesaxe /*
128fb2f18f8Sesaxe  * Macro to test if PG is managed by the CMT PG class
129fb2f18f8Sesaxe  */
130fb2f18f8Sesaxe #define	IS_CMT_PG(pg)	(((pg_t *)(pg))->pg_class->pgc_id == pg_cmt_class_id)
131fb2f18f8Sesaxe 
132fb2f18f8Sesaxe /*
133*0e751525SEric Saxe  * Status codes for CMT lineage validation
134*0e751525SEric Saxe  * See cmt_lineage_validate() below
135*0e751525SEric Saxe  */
136*0e751525SEric Saxe typedef enum cmt_lineage_validation {
137*0e751525SEric Saxe 	CMT_LINEAGE_VALID,
138*0e751525SEric Saxe 	CMT_LINEAGE_NON_CONCENTRIC,
139*0e751525SEric Saxe 	CMT_LINEAGE_REPAIRED,
140*0e751525SEric Saxe 	CMT_LINEAGE_UNRECOVERABLE
141*0e751525SEric Saxe } cmt_lineage_validation_t;
142*0e751525SEric Saxe 
143*0e751525SEric Saxe /*
144*0e751525SEric Saxe  * Status of the current lineage under construction.
145*0e751525SEric Saxe  * One must be holding cpu_lock to change this.
146*0e751525SEric Saxe  */
147*0e751525SEric Saxe static cmt_lineage_validation_t	cmt_lineage_status = CMT_LINEAGE_VALID;
148*0e751525SEric Saxe 
149*0e751525SEric Saxe /*
150*0e751525SEric Saxe  * Power domain definitions (on x86) are defined by ACPI, and
151*0e751525SEric Saxe  * therefore may be subject to BIOS bugs.
152*0e751525SEric Saxe  */
153*0e751525SEric Saxe #define	PG_CMT_HW_SUSPECT(hw)	PGHW_IS_PM_DOMAIN(hw)
154*0e751525SEric Saxe 
155*0e751525SEric Saxe /*
156fb2f18f8Sesaxe  * CMT PG ops
157fb2f18f8Sesaxe  */
struct pg_ops pg_ops_cmt = {
	pg_cmt_alloc,		/* alloc */
	pg_cmt_free,		/* free */
	pg_cmt_cpu_init,	/* cpu_init */
	pg_cmt_cpu_fini,	/* cpu_fini */
	pg_cmt_cpu_active,	/* cpu_active */
	pg_cmt_cpu_inactive,	/* cpu_inactive */
	pg_cmt_cpupart_in,	/* cpupart_in */
	NULL,			/* cpupart_out */
	pg_cmt_cpupart_move,	/* cpupart_move */
	pg_cmt_cpu_belongs,	/* cpu_belongs */
	pg_cmt_policy_name,	/* policy_name */
};
171fb2f18f8Sesaxe 
172fb2f18f8Sesaxe /*
173fb2f18f8Sesaxe  * Initialize the CMT PG class
174fb2f18f8Sesaxe  */
175fb2f18f8Sesaxe void
176fb2f18f8Sesaxe pg_cmt_class_init(void)
177fb2f18f8Sesaxe {
178fb2f18f8Sesaxe 	if (cmt_sched_disabled)
179fb2f18f8Sesaxe 		return;
180fb2f18f8Sesaxe 
181fb2f18f8Sesaxe 	pg_cmt_class_id = pg_class_register("cmt", &pg_ops_cmt, PGR_PHYSICAL);
182fb2f18f8Sesaxe }
183fb2f18f8Sesaxe 
184fb2f18f8Sesaxe /*
185fb2f18f8Sesaxe  * Called to indicate a new CPU has started up so
186fb2f18f8Sesaxe  * that either t0 or the slave startup thread can
187fb2f18f8Sesaxe  * be accounted for.
188fb2f18f8Sesaxe  */
void
pg_cmt_cpu_startup(cpu_t *cp)
{
	/*
	 * Report a thread switch event from the CPU's idle thread to
	 * whatever is currently running there, so that the PG event
	 * callbacks account for the thread now on the new CPU.
	 */
	pg_ev_thread_swtch(cp, gethrtime_unscaled(), cp->cpu_idle_thread,
	    cp->cpu_thread);
}
195fb2f18f8Sesaxe 
196fb2f18f8Sesaxe /*
197fb2f18f8Sesaxe  * Return non-zero if thread can migrate between "from" and "to"
198fb2f18f8Sesaxe  * without a performance penalty
199fb2f18f8Sesaxe  */
200fb2f18f8Sesaxe int
201fb2f18f8Sesaxe pg_cmt_can_migrate(cpu_t *from, cpu_t *to)
202fb2f18f8Sesaxe {
203fb2f18f8Sesaxe 	if (from->cpu_physid->cpu_cacheid ==
204fb2f18f8Sesaxe 	    to->cpu_physid->cpu_cacheid)
205fb2f18f8Sesaxe 		return (1);
206fb2f18f8Sesaxe 	return (0);
207fb2f18f8Sesaxe }
208fb2f18f8Sesaxe 
209fb2f18f8Sesaxe /*
210fb2f18f8Sesaxe  * CMT class specific PG allocation
211fb2f18f8Sesaxe  */
212fb2f18f8Sesaxe static pg_t *
213fb2f18f8Sesaxe pg_cmt_alloc(void)
214fb2f18f8Sesaxe {
215fb2f18f8Sesaxe 	return (kmem_zalloc(sizeof (pg_cmt_t), KM_NOSLEEP));
216fb2f18f8Sesaxe }
217fb2f18f8Sesaxe 
218fb2f18f8Sesaxe /*
219fb2f18f8Sesaxe  * Class specific PG de-allocation
220fb2f18f8Sesaxe  */
221fb2f18f8Sesaxe static void
222fb2f18f8Sesaxe pg_cmt_free(pg_t *pg)
223fb2f18f8Sesaxe {
224fb2f18f8Sesaxe 	ASSERT(pg != NULL);
225fb2f18f8Sesaxe 	ASSERT(IS_CMT_PG(pg));
226fb2f18f8Sesaxe 
227fb2f18f8Sesaxe 	kmem_free((pg_cmt_t *)pg, sizeof (pg_cmt_t));
228fb2f18f8Sesaxe }
229fb2f18f8Sesaxe 
230fb2f18f8Sesaxe /*
231*0e751525SEric Saxe  * Given a hardware sharing relationship, return which dispatcher
232*0e751525SEric Saxe  * policies should be implemented to optimize performance and efficiency
233fb2f18f8Sesaxe  */
234*0e751525SEric Saxe static pg_cmt_policy_t
235*0e751525SEric Saxe pg_cmt_policy(pghw_type_t hw)
236fb2f18f8Sesaxe {
237*0e751525SEric Saxe 	pg_cmt_policy_t p;
238*0e751525SEric Saxe 
239*0e751525SEric Saxe 	/*
240*0e751525SEric Saxe 	 * Give the platform a chance to override the default
241*0e751525SEric Saxe 	 */
242*0e751525SEric Saxe 	if ((p = pg_plat_cmt_policy(hw)) != CMT_NO_POLICY)
243*0e751525SEric Saxe 		return (p);
244*0e751525SEric Saxe 
245*0e751525SEric Saxe 	switch (hw) {
246*0e751525SEric Saxe 	case PGHW_IPIPE:
247*0e751525SEric Saxe 	case PGHW_FPU:
248*0e751525SEric Saxe 	case PGHW_CHIP:
249*0e751525SEric Saxe 		return (CMT_BALANCE);
250*0e751525SEric Saxe 	case PGHW_CACHE:
251*0e751525SEric Saxe 		return (CMT_AFFINITY);
252*0e751525SEric Saxe 	case PGHW_POW_ACTIVE:
253*0e751525SEric Saxe 	case PGHW_POW_IDLE:
254*0e751525SEric Saxe 		return (CMT_BALANCE);
255*0e751525SEric Saxe 	default:
256*0e751525SEric Saxe 		return (CMT_NO_POLICY);
257*0e751525SEric Saxe 	}
258*0e751525SEric Saxe }
259*0e751525SEric Saxe 
260*0e751525SEric Saxe /*
261*0e751525SEric Saxe  * Rank the importance of optimizing for the pg1 relationship vs.
262*0e751525SEric Saxe  * the pg2 relationship.
263*0e751525SEric Saxe  */
264*0e751525SEric Saxe static pg_cmt_t *
265*0e751525SEric Saxe pg_cmt_hier_rank(pg_cmt_t *pg1, pg_cmt_t *pg2)
266*0e751525SEric Saxe {
267*0e751525SEric Saxe 	pghw_type_t hw1 = ((pghw_t *)pg1)->pghw_hw;
268*0e751525SEric Saxe 	pghw_type_t hw2 = ((pghw_t *)pg2)->pghw_hw;
269*0e751525SEric Saxe 
270*0e751525SEric Saxe 	/*
271*0e751525SEric Saxe 	 * A power domain is only important if CPUPM is enabled.
272*0e751525SEric Saxe 	 */
273*0e751525SEric Saxe 	if (cpupm_get_policy() == CPUPM_POLICY_DISABLED) {
274*0e751525SEric Saxe 		if (PGHW_IS_PM_DOMAIN(hw1) && !PGHW_IS_PM_DOMAIN(hw2))
275*0e751525SEric Saxe 			return (pg2);
276*0e751525SEric Saxe 		if (PGHW_IS_PM_DOMAIN(hw2) && !PGHW_IS_PM_DOMAIN(hw1))
277*0e751525SEric Saxe 			return (pg1);
278*0e751525SEric Saxe 	}
279*0e751525SEric Saxe 
280*0e751525SEric Saxe 	/*
281*0e751525SEric Saxe 	 * Otherwise, ask the platform
282*0e751525SEric Saxe 	 */
283*0e751525SEric Saxe 	if (pg_plat_hw_rank(hw1, hw2) == hw1)
284*0e751525SEric Saxe 		return (pg1);
285*0e751525SEric Saxe 	else
286*0e751525SEric Saxe 		return (pg2);
287*0e751525SEric Saxe }
288*0e751525SEric Saxe 
289*0e751525SEric Saxe /*
290*0e751525SEric Saxe  * Initialize CMT callbacks for the given PG
291*0e751525SEric Saxe  */
292*0e751525SEric Saxe static void
293*0e751525SEric Saxe cmt_callback_init(pg_t *pg)
294*0e751525SEric Saxe {
295*0e751525SEric Saxe 	switch (((pghw_t *)pg)->pghw_hw) {
296*0e751525SEric Saxe 	case PGHW_POW_ACTIVE:
297*0e751525SEric Saxe 		pg->pg_cb.thread_swtch = cmt_ev_thread_swtch_pwr;
298*0e751525SEric Saxe 		pg->pg_cb.thread_remain = cmt_ev_thread_remain_pwr;
299*0e751525SEric Saxe 		break;
300*0e751525SEric Saxe 	default:
301*0e751525SEric Saxe 		pg->pg_cb.thread_swtch = cmt_ev_thread_swtch;
302*0e751525SEric Saxe 
303*0e751525SEric Saxe 	}
304*0e751525SEric Saxe }
305*0e751525SEric Saxe 
306*0e751525SEric Saxe /*
307*0e751525SEric Saxe  * Promote PG above its current parent.
308*0e751525SEric Saxe  * This is only legal if PG has an equal or greater number of CPUs
309*0e751525SEric Saxe  * than its parent.
310*0e751525SEric Saxe  */
static void
cmt_hier_promote(pg_cmt_t *pg)
{
	pg_cmt_t	*parent;
	group_t		*children;
	cpu_t		*cpu;
	group_iter_t	iter;
	pg_cpu_itr_t	cpu_iter;
	int		r;
	int		err;

	ASSERT(MUTEX_HELD(&cpu_lock));

	parent = pg->cmt_parent;
	if (parent == NULL) {
		/*
		 * Nothing to do: PG is already at the top of its lineage.
		 */
		return;
	}

	/* Promotion is only legal if PG covers at least as many CPUs */
	ASSERT(PG_NUM_CPUS((pg_t *)pg) >= PG_NUM_CPUS((pg_t *)parent));

	/*
	 * We're changing around the hierarchy, which is actively traversed
	 * by the dispatcher. Pause CPUS to ensure exclusivity.
	 */
	pause_cpus(NULL);

	/*
	 * If necessary, update the parent's sibling set, replacing parent
	 * with PG. (The group_remove() can fail if the parent was never
	 * added; only add PG when the remove succeeded.)
	 */
	if (parent->cmt_siblings) {
		if (group_remove(parent->cmt_siblings, parent, GRP_NORESIZE)
		    != -1) {
			r = group_add(parent->cmt_siblings, pg, GRP_NORESIZE);
			ASSERT(r != -1);
		}
	}

	/*
	 * If the parent is at the top of the hierarchy, replace its entry
	 * in the root lgroup's group of top level PGs.
	 */
	if (parent->cmt_parent == NULL &&
	    parent->cmt_siblings != &cmt_root->cl_pgs) {
		if (group_remove(&cmt_root->cl_pgs, parent, GRP_NORESIZE)
		    != -1) {
			r = group_add(&cmt_root->cl_pgs, pg, GRP_NORESIZE);
			ASSERT(r != -1);
		}
	}

	/*
	 * We assume (and therefore assert) that the PG being promoted is an
	 * only child of its parent. Update the parent's children set
	 * replacing PG's entry with the parent (since the parent is becoming
	 * the child). Then have PG and the parent swap children sets.
	 */
	ASSERT(GROUP_SIZE(parent->cmt_children) <= 1);
	if (group_remove(parent->cmt_children, pg, GRP_NORESIZE) != -1) {
		r = group_add(parent->cmt_children, parent, GRP_NORESIZE);
		ASSERT(r != -1);
	}

	children = pg->cmt_children;
	pg->cmt_children = parent->cmt_children;
	parent->cmt_children = children;

	/*
	 * Update the sibling references for PG and its parent
	 */
	pg->cmt_siblings = parent->cmt_siblings;
	parent->cmt_siblings = pg->cmt_children;

	/*
	 * Update any cached lineages in the per CPU pg data.
	 */
	PG_CPU_ITR_INIT(pg, cpu_iter);
	while ((cpu = pg_cpu_next(&cpu_iter)) != NULL) {
		int		idx;
		group_t		*pgs;
		pg_cmt_t	*cpu_pg;

		/*
		 * Iterate over the CPU's PGs updating the children
		 * of the PG being promoted, since they have a new parent.
		 */
		pgs = &cpu->cpu_pg->pgs;
		group_iter_init(&iter);
		while ((cpu_pg = group_iterate(pgs, &iter)) != NULL) {
			if (cpu_pg->cmt_parent == pg) {
				cpu_pg->cmt_parent = parent;
			}
		}

		/*
		 * Update the CMT load balancing lineage
		 */
		pgs = &cpu->cpu_pg->cmt_pgs;
		if ((idx = group_find(pgs, (void *)pg)) == -1) {
			/*
			 * Unless this is the CPU whose lineage is being
			 * constructed, the PG being promoted should be
			 * in the lineage.
			 */
			ASSERT(GROUP_SIZE(pgs) == 0);
			continue;
		}

		/*
		 * NOTE(review): GROUP_ACCESS(pgs, idx - 1) is evaluated
		 * before the idx > 0 assertion below; the two ASSERTs
		 * could arguably be swapped.
		 */
		ASSERT(GROUP_ACCESS(pgs, idx - 1) == parent);
		ASSERT(idx > 0);

		/*
		 * Have the child and the parent swap places in the CPU's
		 * lineage. (Removing at idx first, then idx - 1, deletes
		 * the two adjacent entries without index shifting issues.)
		 */
		group_remove_at(pgs, idx);
		group_remove_at(pgs, idx - 1);
		err = group_add_at(pgs, parent, idx);
		ASSERT(err == 0);
		err = group_add_at(pgs, pg, idx - 1);
		ASSERT(err == 0);
	}

	/*
	 * Update the parent references for PG and its parent
	 */
	pg->cmt_parent = parent->cmt_parent;
	parent->cmt_parent = pg;

	start_cpus();
}
445fb2f18f8Sesaxe 
446fb2f18f8Sesaxe /*
447fb2f18f8Sesaxe  * CMT class callback for a new CPU entering the system
448fb2f18f8Sesaxe  */
static void
pg_cmt_cpu_init(cpu_t *cp)
{
	pg_cmt_t	*pg;
	group_t		*cmt_pgs;
	int		levels, level;
	pghw_type_t	hw;
	pg_t		*pg_cache = NULL;
	pg_cmt_t	*cpu_cmt_hier[PGHW_NUM_COMPONENTS];
	lgrp_handle_t	lgrp_handle;
	cmt_lgrp_t	*lgrp;

	ASSERT(MUTEX_HELD(&cpu_lock));

	if (cmt_sched_disabled)
		return;

	/*
	 * A new CPU is coming into the system.
	 * Interrogate the platform to see if the CPU
	 * has any performance or efficiency relevant
	 * sharing relationships
	 */
	cmt_pgs = &cp->cpu_pg->cmt_pgs;
	cp->cpu_pg->cmt_lineage = NULL;

	/*
	 * The bzero() below also provides the NULL sentinel one slot past
	 * the top of the lineage that the promotion loop further down
	 * relies upon.
	 */
	bzero(cpu_cmt_hier, sizeof (cpu_cmt_hier));
	levels = 0;
	for (hw = PGHW_START; hw < PGHW_NUM_COMPONENTS; hw++) {

		pg_cmt_policy_t	policy;

		/*
		 * We're only interested in the hw sharing relationships
		 * for which we know how to optimize.
		 */
		policy = pg_cmt_policy(hw);
		if (policy == CMT_NO_POLICY ||
		    pg_plat_hw_shared(cp, hw) == 0)
			continue;

		/*
		 * Continue if the hardware sharing relationship has been
		 * blacklisted (e.g. a suspect BIOS-defined power domain).
		 */
		if (cmt_hw_blacklisted[hw]) {
			continue;
		}

		/*
		 * Find (or create) the PG associated with
		 * the hw sharing relationship in which cp
		 * belongs.
		 *
		 * Determine if a suitable PG already
		 * exists, or if one needs to be created.
		 */
		pg = (pg_cmt_t *)pghw_place_cpu(cp, hw);
		if (pg == NULL) {
			/*
			 * Create a new one.
			 * Initialize the common...
			 */
			pg = (pg_cmt_t *)pg_create(pg_cmt_class_id);

			/* ... physical ... */
			pghw_init((pghw_t *)pg, cp, hw);

			/*
			 * ... and CMT specific portions of the
			 * structure.
			 */
			pg->cmt_policy = policy;

			/* CMT event callbacks */
			cmt_callback_init((pg_t *)pg);

			bitset_init(&pg->cmt_cpus_actv_set);
			group_create(&pg->cmt_cpus_actv);
		} else {
			ASSERT(IS_CMT_PG(pg));
		}

		/* Add the CPU to the PG */
		pg_cpu_add((pg_t *)pg, cp);

		/*
		 * Ensure capacity of the active CPU group/bitset
		 */
		group_expand(&pg->cmt_cpus_actv,
		    GROUP_SIZE(&((pg_t *)pg)->pg_cpus));

		/* The bitset is indexed by cpu_seqid; grow it as needed */
		if (cp->cpu_seqid >=
		    bitset_capacity(&pg->cmt_cpus_actv_set)) {
			bitset_resize(&pg->cmt_cpus_actv_set,
			    cp->cpu_seqid + 1);
		}

		/*
		 * Build a lineage of CMT PGs for load balancing / coalescence
		 */
		if (policy & (CMT_BALANCE | CMT_COALESCE)) {
			cpu_cmt_hier[levels++] = pg;
		}

		/* Cache this for later */
		if (hw == PGHW_CACHE)
			pg_cache = (pg_t *)pg;
	}

	group_expand(cmt_pgs, levels);

	/* Lazily create the root lgroup's cmt_lgrp_t on first use */
	if (cmt_root == NULL)
		cmt_root = pg_cmt_lgrp_create(lgrp_plat_root_hand());

	/*
	 * Find the lgrp that encapsulates this CPU's CMT hierarchy
	 */
	lgrp_handle = lgrp_plat_cpu_to_hand(cp->cpu_id);
	if ((lgrp = pg_cmt_find_lgrp(lgrp_handle)) == NULL)
		lgrp = pg_cmt_lgrp_create(lgrp_handle);

	/*
	 * Ascendingly sort the PGs in the lineage by number of CPUs
	 */
	pg_cmt_hier_sort(cpu_cmt_hier, levels);

	/*
	 * Examine the lineage and validate it.
	 * This routine will also try to fix the lineage along with the
	 * rest of the PG hierarchy should it detect an issue.
	 *
	 * If it returns -1, an unrecoverable error has happened and we
	 * need to return.
	 */
	if (pg_cmt_lineage_validate(cpu_cmt_hier, &levels) < 0)
		return;

	/*
	 * For existing PGs in the lineage, verify that the parent is
	 * correct, as the generation in the lineage may have changed
	 * as a result of the sorting. Start the traversal at the top
	 * of the lineage, moving down.
	 */
	for (level = levels - 1; level >= 0; ) {
		int reorg;

		reorg = 0;
		pg = cpu_cmt_hier[level];

		/*
		 * Promote PGs at an incorrect generation into place.
		 *
		 * NOTE(review): at the topmost level this reads
		 * cpu_cmt_hier[levels], relying on the bzero() above
		 * for a NULL sentinel; this is only in bounds while
		 * levels < PGHW_NUM_COMPONENTS -- confirm that holds.
		 */
		while (pg->cmt_parent &&
		    pg->cmt_parent != cpu_cmt_hier[level + 1]) {
			cmt_hier_promote(pg);
			reorg++;
		}
		/* After any promotion, restart the scan from the top */
		if (reorg > 0)
			level = levels - 1;
		else
			level--;
	}

	/*
	 * For each of the PGs in the CPU's lineage:
	 *	- Add an entry in the CPU sorted CMT PG group
	 *	  which is used for top down CMT load balancing
	 *	- Tie the PG into the CMT hierarchy by connecting
	 *	  it to its parent and siblings.
	 */
	for (level = 0; level < levels; level++) {
		uint_t		children;
		int		err;

		pg = cpu_cmt_hier[level];
		/* cmt_pgs is ordered top-down; the lineage is bottom-up */
		err = group_add_at(cmt_pgs, pg, levels - level - 1);
		ASSERT(err == 0);

		if (level == 0)
			cp->cpu_pg->cmt_lineage = (pg_t *)pg;

		if (pg->cmt_siblings != NULL) {
			/* Already initialized */
			ASSERT(pg->cmt_parent == NULL ||
			    pg->cmt_parent == cpu_cmt_hier[level + 1]);
			ASSERT(pg->cmt_siblings == &lgrp->cl_pgs ||
			    ((pg->cmt_parent != NULL) &&
			    pg->cmt_siblings == pg->cmt_parent->cmt_children));
			continue;
		}

		if ((level + 1) == levels) {
			/* Top of the lineage: siblings are the lgrp's PGs */
			pg->cmt_parent = NULL;

			pg->cmt_siblings = &lgrp->cl_pgs;
			children = ++lgrp->cl_npgs;
			if (cmt_root != lgrp)
				cmt_root->cl_npgs++;
		} else {
			pg->cmt_parent = cpu_cmt_hier[level + 1];

			/*
			 * A good parent keeps track of their children.
			 * The parent's children group is also the PG's
			 * siblings.
			 */
			if (pg->cmt_parent->cmt_children == NULL) {
				pg->cmt_parent->cmt_children =
				    kmem_zalloc(sizeof (group_t), KM_SLEEP);
				group_create(pg->cmt_parent->cmt_children);
			}
			pg->cmt_siblings = pg->cmt_parent->cmt_children;
			children = ++pg->cmt_parent->cmt_nchildren;
		}

		group_expand(pg->cmt_siblings, children);
		group_expand(&cmt_root->cl_pgs, cmt_root->cl_npgs);
	}

	/*
	 * Cache the chip and core IDs in the cpu_t->cpu_physid structure
	 * for fast lookups later.
	 */
	if (cp->cpu_physid) {
		cp->cpu_physid->cpu_chipid =
		    pg_plat_hw_instance_id(cp, PGHW_CHIP);
		cp->cpu_physid->cpu_coreid = pg_plat_get_core_id(cp);

		/*
		 * If this cpu has a PG representing shared cache, then set
		 * cpu_cacheid to that PG's logical id
		 */
		if (pg_cache)
			cp->cpu_physid->cpu_cacheid = pg_cache->pg_id;
	}

	/* CPU0 only initialization */
	if (is_cpu0) {
		pg_cmt_cpu_startup(cp);
		is_cpu0 = 0;
		cpu0_lgrp = lgrp;
	}

}
694fb2f18f8Sesaxe 
/*
 * Class callback when a CPU is leaving the system (deletion)
 *
 * Tears down the CPU's CMT state in two passes:
 * 1) Walk up the CPU's CMT lineage, removing each PG from the CPU's
 *    load balancing lineage and, for PGs about to become empty,
 *    detaching them from their parent/sibling bookkeeping.
 * 2) Remove the CPU from each of its CMT PGs, destroying any PG
 *    that becomes empty as a result.
 */
static void
pg_cmt_cpu_fini(cpu_t *cp)
{
	group_iter_t	i;
	pg_cmt_t	*pg;
	group_t		*pgs, *cmt_pgs;
	lgrp_handle_t	lgrp_handle;
	cmt_lgrp_t	*lgrp;

	/* Nothing to tear down if CMT scheduling is disabled */
	if (cmt_sched_disabled)
		return;

	pgs = &cp->cpu_pg->pgs;
	cmt_pgs = &cp->cpu_pg->cmt_pgs;

	/*
	 * Find the lgroup that encapsulates this CPU's CMT hierarchy
	 */
	lgrp_handle = lgrp_plat_cpu_to_hand(cp->cpu_id);

	lgrp = pg_cmt_find_lgrp(lgrp_handle);
	if (ncpus == 1 && lgrp != cpu0_lgrp) {
		/*
		 * One might wonder how we could be deconfiguring the
		 * only CPU in the system.
		 *
		 * On Starcat systems when null_proc_lpa is detected,
		 * the boot CPU (which is already configured into a leaf
		 * lgroup), is moved into the root lgroup. This is done by
		 * deconfiguring it from both lgroups and processor
		 * groups), and then later reconfiguring it back in.  This
		 * call to pg_cmt_cpu_fini() is part of that deconfiguration.
		 *
		 * This special case is detected by noting that the platform
		 * has changed the CPU's lgrp affiliation (since it now
		 * belongs in the root). In this case, use the cmt_lgrp_t
		 * cached for the boot CPU, since this is what needs to be
		 * torn down.
		 */
		lgrp = cpu0_lgrp;
	}

	ASSERT(lgrp != NULL);

	/*
	 * First, clean up anything load balancing specific for each of
	 * the CPU's PGs that participated in CMT load balancing
	 */
	pg = (pg_cmt_t *)cp->cpu_pg->cmt_lineage;
	while (pg != NULL) {

		/*
		 * Remove the PG from the CPU's load balancing lineage
		 */
		(void) group_remove(cmt_pgs, pg, GRP_RESIZE);

		/*
		 * If it's about to become empty, destroy its children
		 * group, and remove its reference from its siblings.
		 * This is done here (rather than below) to avoid removing
		 * our reference from a PG that we just eliminated.
		 */
		if (GROUP_SIZE(&((pg_t *)pg)->pg_cpus) == 1) {
			if (pg->cmt_children != NULL)
				group_destroy(pg->cmt_children);
			if (pg->cmt_siblings != NULL) {
				/*
				 * Top level PGs are counted against the
				 * lgroup; others against their parent.
				 */
				if (pg->cmt_siblings == &lgrp->cl_pgs)
					lgrp->cl_npgs--;
				else
					pg->cmt_parent->cmt_nchildren--;
			}
		}
		pg = pg->cmt_parent;
	}
	ASSERT(GROUP_SIZE(cmt_pgs) == 0);

	/*
	 * Now that the load balancing lineage updates have happened,
	 * remove the CPU from all its PGs (destroying any that become
	 * empty).
	 */
	group_iter_init(&i);
	while ((pg = group_iterate(pgs, &i)) != NULL) {
		if (IS_CMT_PG(pg) == 0)
			continue;

		pg_cpu_delete((pg_t *)pg, cp);
		/*
		 * Deleting the CPU from the PG changes the CPU's
		 * PG group over which we are actively iterating
		 * Re-initialize the iteration
		 */
		group_iter_init(&i);

		if (GROUP_SIZE(&((pg_t *)pg)->pg_cpus) == 0) {

			/*
			 * The PG has become zero sized, so destroy it.
			 */
			group_destroy(&pg->cmt_cpus_actv);
			bitset_fini(&pg->cmt_cpus_actv_set);
			pghw_fini((pghw_t *)pg);

			pg_destroy((pg_t *)pg);
		}
	}
}
805fb2f18f8Sesaxe 
806fb2f18f8Sesaxe /*
807fb2f18f8Sesaxe  * Class callback when a CPU is entering a cpu partition
808fb2f18f8Sesaxe  */
809fb2f18f8Sesaxe static void
810fb2f18f8Sesaxe pg_cmt_cpupart_in(cpu_t *cp, cpupart_t *pp)
811fb2f18f8Sesaxe {
812fb2f18f8Sesaxe 	group_t		*pgs;
813fb2f18f8Sesaxe 	pg_t		*pg;
814fb2f18f8Sesaxe 	group_iter_t	i;
815fb2f18f8Sesaxe 
816fb2f18f8Sesaxe 	ASSERT(MUTEX_HELD(&cpu_lock));
817fb2f18f8Sesaxe 
818*0e751525SEric Saxe 	if (cmt_sched_disabled)
819*0e751525SEric Saxe 		return;
820*0e751525SEric Saxe 
821fb2f18f8Sesaxe 	pgs = &cp->cpu_pg->pgs;
822fb2f18f8Sesaxe 
823fb2f18f8Sesaxe 	/*
824fb2f18f8Sesaxe 	 * Ensure that the new partition's PG bitset
825fb2f18f8Sesaxe 	 * is large enough for all CMT PG's to which cp
826fb2f18f8Sesaxe 	 * belongs
827fb2f18f8Sesaxe 	 */
828fb2f18f8Sesaxe 	group_iter_init(&i);
829fb2f18f8Sesaxe 	while ((pg = group_iterate(pgs, &i)) != NULL) {
830fb2f18f8Sesaxe 		if (IS_CMT_PG(pg) == 0)
831fb2f18f8Sesaxe 			continue;
832fb2f18f8Sesaxe 
833fb2f18f8Sesaxe 		if (bitset_capacity(&pp->cp_cmt_pgs) <= pg->pg_id)
834fb2f18f8Sesaxe 			bitset_resize(&pp->cp_cmt_pgs, pg->pg_id + 1);
835fb2f18f8Sesaxe 	}
836fb2f18f8Sesaxe }
837fb2f18f8Sesaxe 
/*
 * Class callback when a CPU is actually moving partitions
 *
 * For each of the CPU's CMT PGs: mark the PG active in the new
 * partition's bitset, and clear it from the old partition's bitset
 * when cp was the last active CPU that partition had in the PG.
 */
static void
pg_cmt_cpupart_move(cpu_t *cp, cpupart_t *oldpp, cpupart_t *newpp)
{
	cpu_t		*cpp;
	group_t		*pgs;
	pg_t		*pg;
	group_iter_t	pg_iter;
	pg_cpu_itr_t	cpu_iter;
	boolean_t	found;

	ASSERT(MUTEX_HELD(&cpu_lock));

	/* No CMT bookkeeping to maintain when CMT scheduling is disabled */
	if (cmt_sched_disabled)
		return;

	pgs = &cp->cpu_pg->pgs;
	group_iter_init(&pg_iter);

	/*
	 * Iterate over the CPUs CMT PGs
	 */
	while ((pg = group_iterate(pgs, &pg_iter)) != NULL) {

		if (IS_CMT_PG(pg) == 0)
			continue;

		/*
		 * Add the PG to the bitset in the new partition.
		 */
		bitset_add(&newpp->cp_cmt_pgs, pg->pg_id);

		/*
		 * Remove the PG from the bitset in the old partition
		 * if the last of the PG's CPUs have left.
		 */
		found = B_FALSE;
		PG_CPU_ITR_INIT(pg, cpu_iter);
		while ((cpp = pg_cpu_next(&cpu_iter)) != NULL) {
			if (cpp == cp)
				continue;
			/*
			 * Another active CPU of this PG still resides in
			 * the old partition; keep the PG marked there.
			 */
			if (CPU_ACTIVE(cpp) &&
			    cpp->cpu_part->cp_id == oldpp->cp_id) {
				found = B_TRUE;
				break;
			}
		}
		/*
		 * NOTE(review): the deletion below targets
		 * cp->cpu_part's bitset rather than oldpp's directly;
		 * presumably cp->cpu_part still refers to the old
		 * partition at this point -- confirm with the caller.
		 */
		if (!found)
			bitset_del(&cp->cpu_part->cp_cmt_pgs, pg->pg_id);
	}
}
891fb2f18f8Sesaxe 
/*
 * Class callback when a CPU becomes active (online)
 *
 * This is called in a context where CPUs are paused
 *
 * Adds the CPU to each of its CMT PGs' active-CPU bookkeeping, and
 * (re)registers a PG as a load balancing candidate when this CPU is
 * the first active CPU in that PG.
 */
static void
pg_cmt_cpu_active(cpu_t *cp)
{
	int		err;
	group_iter_t	i;
	pg_cmt_t	*pg;
	group_t		*pgs;

	ASSERT(MUTEX_HELD(&cpu_lock));

	if (cmt_sched_disabled)
		return;

	pgs = &cp->cpu_pg->pgs;
	group_iter_init(&i);

	/*
	 * Iterate over the CPU's PGs
	 */
	while ((pg = group_iterate(pgs, &i)) != NULL) {

		if (IS_CMT_PG(pg) == 0)
			continue;

		/* Capacity was reserved earlier, so the add must succeed */
		err = group_add(&pg->cmt_cpus_actv, cp, GRP_NORESIZE);
		ASSERT(err == 0);

		/*
		 * If this is the first active CPU in the PG, and it
		 * represents a hardware sharing relationship over which
		 * CMT load balancing is performed, add it as a candidate
		 * for balancing with its siblings.
		 */
		if (GROUP_SIZE(&pg->cmt_cpus_actv) == 1 &&
		    (pg->cmt_policy & (CMT_BALANCE | CMT_COALESCE))) {
			err = group_add(pg->cmt_siblings, pg, GRP_NORESIZE);
			ASSERT(err == 0);

			/*
			 * If this is a top level PG, add it as a balancing
			 * candidate when balancing within the root lgroup.
			 */
			if (pg->cmt_parent == NULL &&
			    pg->cmt_siblings != &cmt_root->cl_pgs) {
				err = group_add(&cmt_root->cl_pgs, pg,
				    GRP_NORESIZE);
				ASSERT(err == 0);
			}
		}

		/*
		 * Notate the CPU in the PGs active CPU bitset.
		 * Also notate the PG as being active in its associated
		 * partition
		 */
		bitset_add(&pg->cmt_cpus_actv_set, cp->cpu_seqid);
		bitset_add(&cp->cpu_part->cp_cmt_pgs, ((pg_t *)pg)->pg_id);
	}
}
956fb2f18f8Sesaxe 
/*
 * Class callback when a CPU goes inactive (offline)
 *
 * This is called in a context where CPUs are paused
 *
 * Mirrors pg_cmt_cpu_active(): removes the CPU from each CMT PG's
 * active-CPU bookkeeping, deregisters PGs that have no active CPUs
 * left as balancing candidates, and updates the partition's PG bitset.
 */
static void
pg_cmt_cpu_inactive(cpu_t *cp)
{
	int		err;
	group_t		*pgs;
	pg_cmt_t	*pg;
	cpu_t		*cpp;
	group_iter_t	i;
	pg_cpu_itr_t	cpu_itr;
	boolean_t	found;

	ASSERT(MUTEX_HELD(&cpu_lock));

	if (cmt_sched_disabled)
		return;

	pgs = &cp->cpu_pg->pgs;
	group_iter_init(&i);

	while ((pg = group_iterate(pgs, &i)) != NULL) {

		if (IS_CMT_PG(pg) == 0)
			continue;

		/*
		 * Remove the CPU from the CMT PGs active CPU group
		 * bitmap
		 */
		err = group_remove(&pg->cmt_cpus_actv, cp, GRP_NORESIZE);
		ASSERT(err == 0);

		bitset_del(&pg->cmt_cpus_actv_set, cp->cpu_seqid);

		/*
		 * If there are no more active CPUs in this PG over which
		 * load was balanced, remove it as a balancing candidate.
		 */
		if (GROUP_SIZE(&pg->cmt_cpus_actv) == 0 &&
		    (pg->cmt_policy & (CMT_BALANCE | CMT_COALESCE))) {
			err = group_remove(pg->cmt_siblings, pg, GRP_NORESIZE);
			ASSERT(err == 0);

			/*
			 * Top level PGs are also removed from the root
			 * lgroup's candidates, undoing the group_add
			 * done in pg_cmt_cpu_active().
			 */
			if (pg->cmt_parent == NULL &&
			    pg->cmt_siblings != &cmt_root->cl_pgs) {
				err = group_remove(&cmt_root->cl_pgs, pg,
				    GRP_NORESIZE);
				ASSERT(err == 0);
			}
		}

		/*
		 * Assert the number of active CPUs does not exceed
		 * the total number of CPUs in the PG
		 */
		ASSERT(GROUP_SIZE(&pg->cmt_cpus_actv) <=
		    GROUP_SIZE(&((pg_t *)pg)->pg_cpus));

		/*
		 * Update the PG bitset in the CPU's old partition:
		 * clear the PG only if no other active CPU from this
		 * partition remains in it.
		 */
		found = B_FALSE;
		PG_CPU_ITR_INIT(pg, cpu_itr);
		while ((cpp = pg_cpu_next(&cpu_itr)) != NULL) {
			if (cpp == cp)
				continue;
			if (CPU_ACTIVE(cpp) &&
			    cpp->cpu_part->cp_id == cp->cpu_part->cp_id) {
				found = B_TRUE;
				break;
			}
		}
		if (!found) {
			bitset_del(&cp->cpu_part->cp_cmt_pgs,
			    ((pg_t *)pg)->pg_id);
		}
	}
}
1039fb2f18f8Sesaxe 
1040fb2f18f8Sesaxe /*
1041fb2f18f8Sesaxe  * Return non-zero if the CPU belongs in the given PG
1042fb2f18f8Sesaxe  */
1043fb2f18f8Sesaxe static int
1044fb2f18f8Sesaxe pg_cmt_cpu_belongs(pg_t *pg, cpu_t *cp)
1045fb2f18f8Sesaxe {
1046fb2f18f8Sesaxe 	cpu_t	*pg_cpu;
1047fb2f18f8Sesaxe 
1048fb2f18f8Sesaxe 	pg_cpu = GROUP_ACCESS(&pg->pg_cpus, 0);
1049fb2f18f8Sesaxe 
1050fb2f18f8Sesaxe 	ASSERT(pg_cpu != NULL);
1051fb2f18f8Sesaxe 
1052fb2f18f8Sesaxe 	/*
1053fb2f18f8Sesaxe 	 * The CPU belongs if, given the nature of the hardware sharing
1054fb2f18f8Sesaxe 	 * relationship represented by the PG, the CPU has that
1055fb2f18f8Sesaxe 	 * relationship with some other CPU already in the PG
1056fb2f18f8Sesaxe 	 */
1057fb2f18f8Sesaxe 	if (pg_plat_cpus_share(cp, pg_cpu, ((pghw_t *)pg)->pghw_hw))
1058fb2f18f8Sesaxe 		return (1);
1059fb2f18f8Sesaxe 
1060fb2f18f8Sesaxe 	return (0);
1061fb2f18f8Sesaxe }
1062fb2f18f8Sesaxe 
/*
 * Sort the CPUs CMT hierarchy, where "size" is the number of levels.
 *
 * Orders the array ascending by each PG's CPU count using a shell
 * sort, then makes a single pass letting the platform break ties
 * between adjacent PGs of equal size (via pg_cmt_hier_rank()).
 */
static void
pg_cmt_hier_sort(pg_cmt_t **hier, int size)
{
	int		i, j, inc;
	pg_t		*tmp;
	pg_t		**h = (pg_t **)hier;

	/*
	 * First sort by number of CPUs.
	 * Shell sort: gap starts at size/2 and shrinks by *5/11 each
	 * round; the inc == 2 special case guarantees a final gap of 1.
	 */
	inc = size / 2;
	while (inc > 0) {
		for (i = inc; i < size; i++) {
			j = i;
			tmp = h[i];
			/* Gapped insertion of h[i] into its sorted slot */
			while ((j >= inc) &&
			    (PG_NUM_CPUS(h[j - inc]) > PG_NUM_CPUS(tmp))) {
				h[j] = h[j - inc];
				j = j - inc;
			}
			h[j] = tmp;
		}
		if (inc == 2)
			inc = 1;
		else
			inc = (inc * 5) / 11;
	}

	/*
	 * Break ties by asking the platform.
	 * Determine if h[i] outranks h[i + 1] and if so, swap them.
	 * NOTE(review): a single pass only reorders adjacent ties;
	 * presumably runs of more than two equal-sized PGs are not
	 * expected here -- confirm with the callers.
	 */
	for (i = 0; i < size - 1; i++) {
		if ((PG_NUM_CPUS(h[i]) == PG_NUM_CPUS(h[i + 1])) &&
		    pg_cmt_hier_rank(hier[i], hier[i + 1]) == hier[i]) {
			tmp = h[i];
			h[i] = h[i + 1];
			h[i + 1] = tmp;
		}
	}
}
1107fb2f18f8Sesaxe 
1108fb2f18f8Sesaxe /*
1109fb2f18f8Sesaxe  * Return a cmt_lgrp_t * given an lgroup handle.
1110fb2f18f8Sesaxe  */
1111fb2f18f8Sesaxe static cmt_lgrp_t *
1112fb2f18f8Sesaxe pg_cmt_find_lgrp(lgrp_handle_t hand)
1113fb2f18f8Sesaxe {
1114fb2f18f8Sesaxe 	cmt_lgrp_t	*lgrp;
1115fb2f18f8Sesaxe 
1116fb2f18f8Sesaxe 	ASSERT(MUTEX_HELD(&cpu_lock));
1117fb2f18f8Sesaxe 
1118fb2f18f8Sesaxe 	lgrp = cmt_lgrps;
1119fb2f18f8Sesaxe 	while (lgrp != NULL) {
1120fb2f18f8Sesaxe 		if (lgrp->cl_hand == hand)
1121a6604450Sesaxe 			break;
1122fb2f18f8Sesaxe 		lgrp = lgrp->cl_next;
1123fb2f18f8Sesaxe 	}
1124a6604450Sesaxe 	return (lgrp);
1125a6604450Sesaxe }
1126fb2f18f8Sesaxe 
1127fb2f18f8Sesaxe /*
1128a6604450Sesaxe  * Create a cmt_lgrp_t with the specified handle.
1129fb2f18f8Sesaxe  */
1130a6604450Sesaxe static cmt_lgrp_t *
1131a6604450Sesaxe pg_cmt_lgrp_create(lgrp_handle_t hand)
1132a6604450Sesaxe {
1133a6604450Sesaxe 	cmt_lgrp_t	*lgrp;
1134a6604450Sesaxe 
1135a6604450Sesaxe 	ASSERT(MUTEX_HELD(&cpu_lock));
1136a6604450Sesaxe 
1137fb2f18f8Sesaxe 	lgrp = kmem_zalloc(sizeof (cmt_lgrp_t), KM_SLEEP);
1138fb2f18f8Sesaxe 
1139fb2f18f8Sesaxe 	lgrp->cl_hand = hand;
1140fb2f18f8Sesaxe 	lgrp->cl_npgs = 0;
1141fb2f18f8Sesaxe 	lgrp->cl_next = cmt_lgrps;
1142fb2f18f8Sesaxe 	cmt_lgrps = lgrp;
1143fb2f18f8Sesaxe 	group_create(&lgrp->cl_pgs);
1144fb2f18f8Sesaxe 
1145fb2f18f8Sesaxe 	return (lgrp);
1146fb2f18f8Sesaxe }
11476890d023SEric Saxe 
/*
 * Interfaces to enable and disable power aware dispatching
 * The caller must be holding cpu_lock.
 *
 * Return 0 on success and -1 on failure.
 */
int
cmt_pad_enable(pghw_type_t type)
{
	group_t		*hwset;
	group_iter_t	iter;
	pg_cmt_t	*pg;

	ASSERT(PGHW_IS_PM_DOMAIN(type));
	ASSERT(MUTEX_HELD(&cpu_lock));

	if ((hwset = pghw_set_lookup(type)) == NULL ||
	    cmt_hw_blacklisted[type]) {
		/*
		 * Unable to find any instances of the specified type
		 * of power domain, or the power domains have been blacklisted.
		 */
		return (-1);
	}

	/*
	 * Iterate over the power domains, setting the default dispatcher
	 * policy for power/performance optimization.
	 *
	 * Simply setting the policy isn't enough in the case where the power
	 * domain is an only child of another PG. Because the dispatcher walks
	 * the PG hierarchy in a top down fashion, the higher up PG's policy
	 * will dominate. So promote the power domain above its parent if both
	 * PG and its parent have the same CPUs to ensure its policy
	 * dominates.
	 */
	group_iter_init(&iter);
	while ((pg = group_iterate(hwset, &iter)) != NULL) {
		/*
		 * If the power domain is an only child to a parent
		 * not implementing the same policy, promote the child
		 * above the parent to activate the policy.
		 */
		pg->cmt_policy = pg_cmt_policy(((pghw_t *)pg)->pghw_hw);
		/*
		 * Keep promoting while the parent covers exactly the
		 * same CPUs but carries a different policy.
		 */
		while ((pg->cmt_parent != NULL) &&
		    (pg->cmt_parent->cmt_policy != pg->cmt_policy) &&
		    (PG_NUM_CPUS((pg_t *)pg) ==
		    PG_NUM_CPUS((pg_t *)pg->cmt_parent))) {
			cmt_hier_promote(pg);
		}
	}

	return (0);
}
1202*0e751525SEric Saxe 
1203*0e751525SEric Saxe int
1204*0e751525SEric Saxe cmt_pad_disable(pghw_type_t type)
1205*0e751525SEric Saxe {
1206*0e751525SEric Saxe 	group_t		*hwset;
1207*0e751525SEric Saxe 	group_iter_t	iter;
1208*0e751525SEric Saxe 	pg_cmt_t	*pg;
1209*0e751525SEric Saxe 	pg_cmt_t	*child;
1210*0e751525SEric Saxe 
1211*0e751525SEric Saxe 	ASSERT(PGHW_IS_PM_DOMAIN(type));
1212*0e751525SEric Saxe 	ASSERT(MUTEX_HELD(&cpu_lock));
1213*0e751525SEric Saxe 
1214*0e751525SEric Saxe 	if ((hwset = pghw_set_lookup(type)) == NULL) {
1215*0e751525SEric Saxe 		/*
1216*0e751525SEric Saxe 		 * Unable to find any instances of the specified type of
1217*0e751525SEric Saxe 		 * power domain.
1218*0e751525SEric Saxe 		 */
1219*0e751525SEric Saxe 		return (-1);
1220*0e751525SEric Saxe 	}
1221*0e751525SEric Saxe 	/*
1222*0e751525SEric Saxe 	 * Iterate over the power domains, setting the default dispatcher
1223*0e751525SEric Saxe 	 * policy for performance optimization (load balancing).
1224*0e751525SEric Saxe 	 */
1225*0e751525SEric Saxe 	group_iter_init(&iter);
1226*0e751525SEric Saxe 	while ((pg = group_iterate(hwset, &iter)) != NULL) {
1227*0e751525SEric Saxe 
1228*0e751525SEric Saxe 		/*
1229*0e751525SEric Saxe 		 * If the power domain has an only child that implements
1230*0e751525SEric Saxe 		 * policy other than load balancing, promote the child
1231*0e751525SEric Saxe 		 * above the power domain to ensure it's policy dominates.
1232*0e751525SEric Saxe 		 */
1233*0e751525SEric Saxe 		if (GROUP_SIZE(pg->cmt_children) == 1) {
1234*0e751525SEric Saxe 			child = GROUP_ACCESS(pg->cmt_children, 0);
1235*0e751525SEric Saxe 			if ((child->cmt_policy & CMT_BALANCE) == 0) {
1236*0e751525SEric Saxe 				cmt_hier_promote(child);
1237*0e751525SEric Saxe 			}
1238*0e751525SEric Saxe 		}
1239*0e751525SEric Saxe 		pg->cmt_policy = CMT_BALANCE;
1240*0e751525SEric Saxe 	}
1241*0e751525SEric Saxe 	return (0);
1242*0e751525SEric Saxe }
1243*0e751525SEric Saxe 
1244*0e751525SEric Saxe /* ARGSUSED */
1245*0e751525SEric Saxe static void
1246*0e751525SEric Saxe cmt_ev_thread_swtch(pg_t *pg, cpu_t *cp, hrtime_t now, kthread_t *old,
1247*0e751525SEric Saxe 		    kthread_t *new)
1248*0e751525SEric Saxe {
1249*0e751525SEric Saxe 	pg_cmt_t	*cmt_pg = (pg_cmt_t *)pg;
1250*0e751525SEric Saxe 
1251*0e751525SEric Saxe 	if (old == cp->cpu_idle_thread) {
1252*0e751525SEric Saxe 		atomic_add_32(&cmt_pg->cmt_utilization, 1);
1253*0e751525SEric Saxe 	} else if (new == cp->cpu_idle_thread) {
1254*0e751525SEric Saxe 		atomic_add_32(&cmt_pg->cmt_utilization, -1);
1255*0e751525SEric Saxe 	}
1256*0e751525SEric Saxe }
1257*0e751525SEric Saxe 
/*
 * Macro to test whether a thread is currently runnable on a CPU in a PG.
 *
 * True when t is runnable (TS_RUN) on a dispatch queue that belongs to
 * a CPU (disp_cpu != NULL), and that CPU is a member of the PG's
 * active CPU set.
 */
#define	THREAD_RUNNABLE_IN_PG(t, pg)					\
	((t)->t_state == TS_RUN &&					\
	    (t)->t_disp_queue->disp_cpu &&				\
	    bitset_in_set(&(pg)->cmt_cpus_actv_set,			\
	    (t)->t_disp_queue->disp_cpu->cpu_seqid))
1266*0e751525SEric Saxe 
/*
 * Context switch event notification for a power-managed CMT PG.
 *
 * Maintains the PG's utilization count (number of non-idle CPUs) with
 * atomic updates, and notifies the CPU power manager on the domain's
 * busy <-> idle transitions (count 0 -> 1 and 1 -> 0).
 */
static void
cmt_ev_thread_swtch_pwr(pg_t *pg, cpu_t *cp, hrtime_t now, kthread_t *old,
    kthread_t *new)
{
	pg_cmt_t	*cmt = (pg_cmt_t *)pg;
	cpupm_domain_t	*dom;
	uint32_t	u;

	if (old == cp->cpu_idle_thread) {
		ASSERT(new != cp->cpu_idle_thread);
		u = atomic_add_32_nv(&cmt->cmt_utilization, 1);
		if (u == 1) {
			/*
			 * Notify the CPU power manager that the domain
			 * is non-idle.
			 */
			dom = (cpupm_domain_t *)cmt->cmt_pg.pghw_handle;
			cpupm_utilization_event(cp, now, dom,
			    CPUPM_DOM_BUSY_FROM_IDLE);
		}
	} else if (new == cp->cpu_idle_thread) {
		ASSERT(old != cp->cpu_idle_thread);
		u = atomic_add_32_nv(&cmt->cmt_utilization, -1);
		if (u == 0) {
			/*
			 * The domain is idle, notify the CPU power
			 * manager.
			 *
			 * Avoid notifying if the thread is simply migrating
			 * between CPUs in the domain.
			 */
			if (!THREAD_RUNNABLE_IN_PG(old, cmt)) {
				dom = (cpupm_domain_t *)cmt->cmt_pg.pghw_handle;
				cpupm_utilization_event(cp, now, dom,
				    CPUPM_DOM_IDLE_FROM_BUSY);
			}
		}
	}
}
1306*0e751525SEric Saxe 
1307*0e751525SEric Saxe /* ARGSUSED */
1308*0e751525SEric Saxe static void
1309*0e751525SEric Saxe cmt_ev_thread_remain_pwr(pg_t *pg, cpu_t *cp, kthread_t *t)
1310*0e751525SEric Saxe {
1311*0e751525SEric Saxe 	pg_cmt_t	*cmt = (pg_cmt_t *)pg;
1312*0e751525SEric Saxe 	cpupm_domain_t	*dom;
1313*0e751525SEric Saxe 
1314*0e751525SEric Saxe 	dom = (cpupm_domain_t *)cmt->cmt_pg.pghw_handle;
1315*0e751525SEric Saxe 	cpupm_utilization_event(cp, (hrtime_t)0, dom, CPUPM_DOM_REMAIN_BUSY);
1316*0e751525SEric Saxe }
1317*0e751525SEric Saxe 
1318*0e751525SEric Saxe /*
1319*0e751525SEric Saxe  * Return the name of the CMT scheduling policy
1320*0e751525SEric Saxe  * being implemented across this PG
1321*0e751525SEric Saxe  */
1322*0e751525SEric Saxe static char *
1323*0e751525SEric Saxe pg_cmt_policy_name(pg_t *pg)
1324*0e751525SEric Saxe {
1325*0e751525SEric Saxe 	pg_cmt_policy_t policy;
1326*0e751525SEric Saxe 
1327*0e751525SEric Saxe 	policy = ((pg_cmt_t *)pg)->cmt_policy;
1328*0e751525SEric Saxe 
1329*0e751525SEric Saxe 	if (policy & CMT_AFFINITY) {
1330*0e751525SEric Saxe 		if (policy & CMT_BALANCE)
1331*0e751525SEric Saxe 			return ("Load Balancing & Affinity");
1332*0e751525SEric Saxe 		else if (policy & CMT_COALESCE)
1333*0e751525SEric Saxe 			return ("Load Coalescence & Affinity");
13346890d023SEric Saxe 		else
1335*0e751525SEric Saxe 			return ("Affinity");
1336*0e751525SEric Saxe 	} else {
1337*0e751525SEric Saxe 		if (policy & CMT_BALANCE)
1338*0e751525SEric Saxe 			return ("Load Balancing");
1339*0e751525SEric Saxe 		else if (policy & CMT_COALESCE)
1340*0e751525SEric Saxe 			return ("Load Coalescence");
1341*0e751525SEric Saxe 		else
1342*0e751525SEric Saxe 			return ("None");
1343*0e751525SEric Saxe 	}
1344*0e751525SEric Saxe }
13456890d023SEric Saxe 
13466890d023SEric Saxe /*
1347*0e751525SEric Saxe  * Prune PG, and all other instances of PG's hardware sharing relationship
1348*0e751525SEric Saxe  * from the PG hierarchy.
13496890d023SEric Saxe  */
1350*0e751525SEric Saxe static int
1351*0e751525SEric Saxe pg_cmt_prune(pg_cmt_t *pg_bad, pg_cmt_t **lineage, int *sz)
1352*0e751525SEric Saxe {
1353*0e751525SEric Saxe 	group_t		*hwset, *children;
1354*0e751525SEric Saxe 	int		i, j, r, size = *sz;
1355*0e751525SEric Saxe 	group_iter_t	hw_iter, child_iter;
1356*0e751525SEric Saxe 	pg_cpu_itr_t	cpu_iter;
1357*0e751525SEric Saxe 	pg_cmt_t	*pg, *child;
1358*0e751525SEric Saxe 	cpu_t		*cpu;
1359*0e751525SEric Saxe 	int		cap_needed;
1360*0e751525SEric Saxe 	pghw_type_t	hw;
13616890d023SEric Saxe 
1362*0e751525SEric Saxe 	ASSERT(MUTEX_HELD(&cpu_lock));
13636890d023SEric Saxe 
1364*0e751525SEric Saxe 	hw = ((pghw_t *)pg_bad)->pghw_hw;
1365*0e751525SEric Saxe 
1366*0e751525SEric Saxe 	if (hw == PGHW_POW_ACTIVE) {
1367*0e751525SEric Saxe 		cmn_err(CE_NOTE, "!Active CPUPM domain groups look suspect. "
1368*0e751525SEric Saxe 		    "Event Based CPUPM Unavailable");
1369*0e751525SEric Saxe 	} else if (hw == PGHW_POW_IDLE) {
1370*0e751525SEric Saxe 		cmn_err(CE_NOTE, "!Idle CPUPM domain groups look suspect. "
1371*0e751525SEric Saxe 		    "Dispatcher assisted CPUPM disabled.");
1372*0e751525SEric Saxe 	}
13736890d023SEric Saxe 
13746890d023SEric Saxe 	/*
1375*0e751525SEric Saxe 	 * Find and eliminate the PG from the lineage.
13766890d023SEric Saxe 	 */
1377*0e751525SEric Saxe 	for (i = 0; i < size; i++) {
1378*0e751525SEric Saxe 		if (lineage[i] == pg_bad) {
1379*0e751525SEric Saxe 			for (j = i; j < size - 1; j++)
1380*0e751525SEric Saxe 				lineage[j] = lineage[j + 1];
1381*0e751525SEric Saxe 			*sz = size - 1;
1382*0e751525SEric Saxe 			break;
1383*0e751525SEric Saxe 		}
1384*0e751525SEric Saxe 	}
1385*0e751525SEric Saxe 
1386*0e751525SEric Saxe 	/*
1387*0e751525SEric Saxe 	 * We'll prune all instances of the hardware sharing relationship
1388*0e751525SEric Saxe 	 * represented by pg. But before we do that (and pause CPUs) we need
1389*0e751525SEric Saxe 	 * to ensure the hierarchy's groups are properly sized.
1390*0e751525SEric Saxe 	 */
1391*0e751525SEric Saxe 	hwset = pghw_set_lookup(hw);
1392*0e751525SEric Saxe 
1393*0e751525SEric Saxe 	/*
1394*0e751525SEric Saxe 	 * Blacklist the hardware so that future groups won't be created.
1395*0e751525SEric Saxe 	 */
1396*0e751525SEric Saxe 	cmt_hw_blacklisted[hw] = 1;
1397*0e751525SEric Saxe 
1398*0e751525SEric Saxe 	/*
1399*0e751525SEric Saxe 	 * For each of the PGs being pruned, ensure sufficient capacity in
1400*0e751525SEric Saxe 	 * the siblings set for the PG's children
1401*0e751525SEric Saxe 	 */
1402*0e751525SEric Saxe 	group_iter_init(&hw_iter);
1403*0e751525SEric Saxe 	while ((pg = group_iterate(hwset, &hw_iter)) != NULL) {
1404*0e751525SEric Saxe 		/*
1405*0e751525SEric Saxe 		 * PG is being pruned, but if it is bringing up more than
1406*0e751525SEric Saxe 		 * one child, ask for more capacity in the siblings group.
1407*0e751525SEric Saxe 		 */
1408*0e751525SEric Saxe 		cap_needed = 0;
1409*0e751525SEric Saxe 		if (pg->cmt_children &&
1410*0e751525SEric Saxe 		    GROUP_SIZE(pg->cmt_children) > 1) {
1411*0e751525SEric Saxe 			cap_needed = GROUP_SIZE(pg->cmt_children) - 1;
1412*0e751525SEric Saxe 
1413*0e751525SEric Saxe 			group_expand(pg->cmt_siblings,
1414*0e751525SEric Saxe 			    GROUP_SIZE(pg->cmt_siblings) + cap_needed);
1415*0e751525SEric Saxe 
1416*0e751525SEric Saxe 			/*
1417*0e751525SEric Saxe 			 * If this is a top level group, also ensure the
1418*0e751525SEric Saxe 			 * capacity in the root lgrp level CMT grouping.
1419*0e751525SEric Saxe 			 */
1420*0e751525SEric Saxe 			if (pg->cmt_parent == NULL &&
1421*0e751525SEric Saxe 			    pg->cmt_siblings != &cmt_root->cl_pgs) {
1422*0e751525SEric Saxe 				group_expand(&cmt_root->cl_pgs,
1423*0e751525SEric Saxe 				    GROUP_SIZE(&cmt_root->cl_pgs) + cap_needed);
1424*0e751525SEric Saxe 			}
1425*0e751525SEric Saxe 		}
1426*0e751525SEric Saxe 	}
1427*0e751525SEric Saxe 
1428*0e751525SEric Saxe 	/*
1429*0e751525SEric Saxe 	 * We're operating on the PG hierarchy. Pause CPUs to ensure
1430*0e751525SEric Saxe 	 * exclusivity with respect to the dispatcher.
1431*0e751525SEric Saxe 	 */
1432*0e751525SEric Saxe 	pause_cpus(NULL);
1433*0e751525SEric Saxe 
1434*0e751525SEric Saxe 	/*
1435*0e751525SEric Saxe 	 * Prune all PG instances of the hardware sharing relationship
1436*0e751525SEric Saxe 	 * represented by pg.
1437*0e751525SEric Saxe 	 */
1438*0e751525SEric Saxe 	group_iter_init(&hw_iter);
1439*0e751525SEric Saxe 	while ((pg = group_iterate(hwset, &hw_iter)) != NULL) {
1440*0e751525SEric Saxe 
1441*0e751525SEric Saxe 		/*
1442*0e751525SEric Saxe 		 * Remove PG from it's group of siblings, if it's there.
1443*0e751525SEric Saxe 		 */
1444*0e751525SEric Saxe 		if (pg->cmt_siblings) {
1445*0e751525SEric Saxe 			(void) group_remove(pg->cmt_siblings, pg, GRP_NORESIZE);
1446*0e751525SEric Saxe 		}
1447*0e751525SEric Saxe 		if (pg->cmt_parent == NULL &&
1448*0e751525SEric Saxe 		    pg->cmt_siblings != &cmt_root->cl_pgs) {
1449*0e751525SEric Saxe 			(void) group_remove(&cmt_root->cl_pgs, pg,
1450*0e751525SEric Saxe 			    GRP_NORESIZE);
1451*0e751525SEric Saxe 		}
1452*0e751525SEric Saxe 		/*
1453*0e751525SEric Saxe 		 * Add PGs children to it's group of siblings.
1454*0e751525SEric Saxe 		 */
1455*0e751525SEric Saxe 		if (pg->cmt_children != NULL) {
1456*0e751525SEric Saxe 			children = pg->cmt_children;
1457*0e751525SEric Saxe 
1458*0e751525SEric Saxe 			group_iter_init(&child_iter);
1459*0e751525SEric Saxe 			while ((child = group_iterate(children, &child_iter))
1460*0e751525SEric Saxe 			    != NULL) {
1461*0e751525SEric Saxe 				/*
1462*0e751525SEric Saxe 				 * Transplant child from it's siblings set to
1463*0e751525SEric Saxe 				 * PGs.
1464*0e751525SEric Saxe 				 */
1465*0e751525SEric Saxe 				if (pg->cmt_siblings != NULL &&
1466*0e751525SEric Saxe 				    child->cmt_siblings != NULL &&
1467*0e751525SEric Saxe 				    group_remove(child->cmt_siblings, child,
1468*0e751525SEric Saxe 				    GRP_NORESIZE) != -1) {
1469*0e751525SEric Saxe 					r = group_add(pg->cmt_siblings, child,
1470*0e751525SEric Saxe 					    GRP_NORESIZE);
1471*0e751525SEric Saxe 					ASSERT(r == 0);
1472*0e751525SEric Saxe 				}
1473*0e751525SEric Saxe 			}
1474*0e751525SEric Saxe 		}
1475*0e751525SEric Saxe 
1476*0e751525SEric Saxe 		/*
1477*0e751525SEric Saxe 		 * Reset the callbacks to the defaults
1478*0e751525SEric Saxe 		 */
1479*0e751525SEric Saxe 		pg_callback_set_defaults((pg_t *)pg);
1480*0e751525SEric Saxe 
1481*0e751525SEric Saxe 		/*
1482*0e751525SEric Saxe 		 * Update all the CPU lineages in each of PG's CPUs
1483*0e751525SEric Saxe 		 */
1484*0e751525SEric Saxe 		PG_CPU_ITR_INIT(pg, cpu_iter);
1485*0e751525SEric Saxe 		while ((cpu = pg_cpu_next(&cpu_iter)) != NULL) {
1486*0e751525SEric Saxe 			group_t		*pgs;
1487*0e751525SEric Saxe 			pg_cmt_t	*cpu_pg;
1488*0e751525SEric Saxe 			group_iter_t	liter;	/* Iterator for the lineage */
1489*0e751525SEric Saxe 
1490*0e751525SEric Saxe 			/*
1491*0e751525SEric Saxe 			 * Iterate over the CPU's PGs updating the children
1492*0e751525SEric Saxe 			 * of the PG being promoted, since they have a new
1493*0e751525SEric Saxe 			 * parent and siblings set.
1494*0e751525SEric Saxe 			 */
1495*0e751525SEric Saxe 			pgs = &cpu->cpu_pg->pgs;
1496*0e751525SEric Saxe 			group_iter_init(&liter);
1497*0e751525SEric Saxe 			while ((cpu_pg = group_iterate(pgs, &liter)) != NULL) {
1498*0e751525SEric Saxe 				if (cpu_pg->cmt_parent == pg) {
1499*0e751525SEric Saxe 					cpu_pg->cmt_parent = pg->cmt_parent;
1500*0e751525SEric Saxe 					cpu_pg->cmt_siblings = pg->cmt_siblings;
1501*0e751525SEric Saxe 				}
1502*0e751525SEric Saxe 			}
1503*0e751525SEric Saxe 
1504*0e751525SEric Saxe 			/*
1505*0e751525SEric Saxe 			 * Update the CPU's lineages
1506*0e751525SEric Saxe 			 */
1507*0e751525SEric Saxe 			pgs = &cpu->cpu_pg->cmt_pgs;
1508*0e751525SEric Saxe 			(void) group_remove(pgs, pg, GRP_NORESIZE);
1509*0e751525SEric Saxe 			pgs = &cpu->cpu_pg->pgs;
1510*0e751525SEric Saxe 			(void) group_remove(pgs, pg, GRP_NORESIZE);
1511*0e751525SEric Saxe 		}
1512*0e751525SEric Saxe 	}
1513*0e751525SEric Saxe 	start_cpus();
1514*0e751525SEric Saxe 	return (0);
1515*0e751525SEric Saxe }
1516*0e751525SEric Saxe 
1517*0e751525SEric Saxe /*
1518*0e751525SEric Saxe  * Disable CMT scheduling
1519*0e751525SEric Saxe  */
1520*0e751525SEric Saxe static void
1521*0e751525SEric Saxe pg_cmt_disable(void)
1522*0e751525SEric Saxe {
1523*0e751525SEric Saxe 	cpu_t	*cpu;
1524*0e751525SEric Saxe 
1525*0e751525SEric Saxe 	pause_cpus(NULL);
1526*0e751525SEric Saxe 	cpu = cpu_list;
1527*0e751525SEric Saxe 
15286890d023SEric Saxe 	do {
1529*0e751525SEric Saxe 		if (cpu->cpu_pg)
1530*0e751525SEric Saxe 			group_empty(&cpu->cpu_pg->cmt_pgs);
1531*0e751525SEric Saxe 	} while ((cpu = cpu->cpu_next) != cpu_list);
1532*0e751525SEric Saxe 
1533*0e751525SEric Saxe 	cmt_sched_disabled = 1;
1534*0e751525SEric Saxe 	start_cpus();
1535*0e751525SEric Saxe 	cmn_err(CE_NOTE, "!CMT thread placement optimizations unavailable");
1536*0e751525SEric Saxe }
1537*0e751525SEric Saxe 
1538*0e751525SEric Saxe static int
1539*0e751525SEric Saxe pg_cmt_lineage_validate(pg_cmt_t **lineage, int *sz)
1540*0e751525SEric Saxe {
1541*0e751525SEric Saxe 	int		i, size;
1542*0e751525SEric Saxe 	pg_cmt_t	*pg, *parent, *pg_bad;
1543*0e751525SEric Saxe 	cpu_t		*cp;
1544*0e751525SEric Saxe 	pg_cpu_itr_t	cpu_iter;
1545*0e751525SEric Saxe 
1546*0e751525SEric Saxe 	ASSERT(MUTEX_HELD(&cpu_lock));
1547*0e751525SEric Saxe 
1548*0e751525SEric Saxe revalidate:
1549*0e751525SEric Saxe 	size = *sz;
1550*0e751525SEric Saxe 	pg_bad = NULL;
1551*0e751525SEric Saxe 	for (i = 0; i < size - 1; i++) {
1552*0e751525SEric Saxe 
1553*0e751525SEric Saxe 		pg = lineage[i];
1554*0e751525SEric Saxe 		parent = lineage[i + 1];
15556890d023SEric Saxe 
15566890d023SEric Saxe 		/*
1557*0e751525SEric Saxe 		 * We assume that the lineage has already been sorted
1558*0e751525SEric Saxe 		 * by the number of CPUs. In fact, we depend on it.
15596890d023SEric Saxe 		 */
1560*0e751525SEric Saxe 		ASSERT(PG_NUM_CPUS((pg_t *)pg) <= PG_NUM_CPUS((pg_t *)parent));
15616890d023SEric Saxe 
15626890d023SEric Saxe 		/*
1563*0e751525SEric Saxe 		 * Walk each of the CPUs in the PGs group, and verify that
1564*0e751525SEric Saxe 		 * the next larger PG contains at least the CPUs in this one.
15656890d023SEric Saxe 		 */
1566*0e751525SEric Saxe 		PG_CPU_ITR_INIT((pg_t *)pg, cpu_iter);
1567*0e751525SEric Saxe 		while ((cp = pg_cpu_next(&cpu_iter)) != NULL) {
1568*0e751525SEric Saxe 			if (pg_cpu_find((pg_t *)parent, cp) == B_FALSE) {
1569*0e751525SEric Saxe 				cmt_lineage_status = CMT_LINEAGE_NON_CONCENTRIC;
1570*0e751525SEric Saxe 				goto handle_error;
15716890d023SEric Saxe 			}
1572*0e751525SEric Saxe 		}
15736890d023SEric Saxe 	}
15746890d023SEric Saxe 
1575*0e751525SEric Saxe handle_error:
1576*0e751525SEric Saxe 	switch (cmt_lineage_status) {
1577*0e751525SEric Saxe 	case CMT_LINEAGE_VALID:
1578*0e751525SEric Saxe 	case CMT_LINEAGE_REPAIRED:
1579*0e751525SEric Saxe 		break;
1580*0e751525SEric Saxe 	case CMT_LINEAGE_NON_CONCENTRIC:
15816890d023SEric Saxe 		/*
1582*0e751525SEric Saxe 		 * We've detected a non-concentric PG lineage.
1583*0e751525SEric Saxe 		 *
1584*0e751525SEric Saxe 		 * This can happen when some of the CPU grouping information
1585*0e751525SEric Saxe 		 * is derived from buggy sources (for example, incorrect ACPI
1586*0e751525SEric Saxe 		 * tables on x86 systems).
1587*0e751525SEric Saxe 		 *
1588*0e751525SEric Saxe 		 * We attempt to recover from this by pruning out the
1589*0e751525SEric Saxe 		 * illegal groupings from the PG hierarchy, which means that
1590*0e751525SEric Saxe 		 * we won't optimize for those levels, but we will for the
1591*0e751525SEric Saxe 		 * remaining ones.
1592*0e751525SEric Saxe 		 *
1593*0e751525SEric Saxe 		 * If a given level has CPUs not found in it's parent, then
1594*0e751525SEric Saxe 		 * we examine the PG and it's parent to see if either grouping
1595*0e751525SEric Saxe 		 * is enumerated from potentially buggy sources.
1596*0e751525SEric Saxe 		 *
1597*0e751525SEric Saxe 		 * If one has less CPUs than the other, and contains CPUs
1598*0e751525SEric Saxe 		 * not found in the parent, and it is an untrusted enumeration,
1599*0e751525SEric Saxe 		 * then prune it. If both have the same number of CPUs, then
1600*0e751525SEric Saxe 		 * prune the one that is untrusted.
1601*0e751525SEric Saxe 		 *
1602*0e751525SEric Saxe 		 * This process repeats until we have a concentric lineage,
1603*0e751525SEric Saxe 		 * or we would have to prune out level derived from what we
1604*0e751525SEric Saxe 		 * thought was a reliable source, in which case CMT scheduling
1605*0e751525SEric Saxe 		 * is disabled all together.
16066890d023SEric Saxe 		 */
1607*0e751525SEric Saxe 		if ((PG_NUM_CPUS((pg_t *)pg) < PG_NUM_CPUS((pg_t *)parent)) &&
1608*0e751525SEric Saxe 		    (PG_CMT_HW_SUSPECT(((pghw_t *)pg)->pghw_hw))) {
1609*0e751525SEric Saxe 			pg_bad = pg;
1610*0e751525SEric Saxe 		} else if (PG_NUM_CPUS((pg_t *)pg) ==
1611*0e751525SEric Saxe 		    PG_NUM_CPUS((pg_t *)parent)) {
1612*0e751525SEric Saxe 			if (PG_CMT_HW_SUSPECT(((pghw_t *)parent)->pghw_hw)) {
1613*0e751525SEric Saxe 				pg_bad = parent;
1614*0e751525SEric Saxe 			} else if (PG_CMT_HW_SUSPECT(((pghw_t *)pg)->pghw_hw)) {
1615*0e751525SEric Saxe 				pg_bad = pg;
16166890d023SEric Saxe 			}
16176890d023SEric Saxe 		}
1618*0e751525SEric Saxe 		if (pg_bad) {
1619*0e751525SEric Saxe 			if (pg_cmt_prune(pg_bad, lineage, sz) == 0) {
1620*0e751525SEric Saxe 				cmt_lineage_status = CMT_LINEAGE_REPAIRED;
1621*0e751525SEric Saxe 				goto revalidate;
1622*0e751525SEric Saxe 			}
1623*0e751525SEric Saxe 		}
1624*0e751525SEric Saxe 		/*FALLTHROUGH*/
1625*0e751525SEric Saxe 	default:
1626*0e751525SEric Saxe 		/*
1627*0e751525SEric Saxe 		 * If we're here, something has gone wrong in trying to
1628*0e751525SEric Saxe 		 * recover from a illegal PG hierarchy, or we've encountered
1629*0e751525SEric Saxe 		 * a validation error for which we don't know how to recover.
1630*0e751525SEric Saxe 		 * In this case, disable CMT scheduling all together.
1631*0e751525SEric Saxe 		 */
1632*0e751525SEric Saxe 		pg_cmt_disable();
1633*0e751525SEric Saxe 		cmt_lineage_status = CMT_LINEAGE_UNRECOVERABLE;
1634*0e751525SEric Saxe 		return (-1);
1635*0e751525SEric Saxe 	}
1636*0e751525SEric Saxe 	return (0);
16376890d023SEric Saxe }
1638