xref: /titanic_51/usr/src/uts/common/disp/cmt.c (revision b885580b43755ee4ea1e280b85428893d2ba9291)
1fb2f18f8Sesaxe /*
2fb2f18f8Sesaxe  * CDDL HEADER START
3fb2f18f8Sesaxe  *
4fb2f18f8Sesaxe  * The contents of this file are subject to the terms of the
5fb2f18f8Sesaxe  * Common Development and Distribution License (the "License").
6fb2f18f8Sesaxe  * You may not use this file except in compliance with the License.
7fb2f18f8Sesaxe  *
8fb2f18f8Sesaxe  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9fb2f18f8Sesaxe  * or http://www.opensolaris.org/os/licensing.
10fb2f18f8Sesaxe  * See the License for the specific language governing permissions
11fb2f18f8Sesaxe  * and limitations under the License.
12fb2f18f8Sesaxe  *
13fb2f18f8Sesaxe  * When distributing Covered Code, include this CDDL HEADER in each
14fb2f18f8Sesaxe  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15fb2f18f8Sesaxe  * If applicable, add the following below this CDDL HEADER, with the
16fb2f18f8Sesaxe  * fields enclosed by brackets "[]" replaced with your own identifying
17fb2f18f8Sesaxe  * information: Portions Copyright [yyyy] [name of copyright owner]
18fb2f18f8Sesaxe  *
19fb2f18f8Sesaxe  * CDDL HEADER END
20fb2f18f8Sesaxe  */
21fb2f18f8Sesaxe /*
223e81cacfSEric Saxe  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23fb2f18f8Sesaxe  * Use is subject to license terms.
24fb2f18f8Sesaxe  */
25fb2f18f8Sesaxe 
26fb2f18f8Sesaxe #include <sys/systm.h>
27fb2f18f8Sesaxe #include <sys/types.h>
28fb2f18f8Sesaxe #include <sys/param.h>
29fb2f18f8Sesaxe #include <sys/thread.h>
30fb2f18f8Sesaxe #include <sys/cpuvar.h>
31fb2f18f8Sesaxe #include <sys/cpupart.h>
32fb2f18f8Sesaxe #include <sys/kmem.h>
33fb2f18f8Sesaxe #include <sys/cmn_err.h>
34fb2f18f8Sesaxe #include <sys/kstat.h>
35fb2f18f8Sesaxe #include <sys/processor.h>
36fb2f18f8Sesaxe #include <sys/disp.h>
37fb2f18f8Sesaxe #include <sys/group.h>
38fb2f18f8Sesaxe #include <sys/pghw.h>
39fb2f18f8Sesaxe #include <sys/bitset.h>
40fb2f18f8Sesaxe #include <sys/lgrp.h>
41fb2f18f8Sesaxe #include <sys/cmt.h>
420e751525SEric Saxe #include <sys/cpu_pm.h>
43fb2f18f8Sesaxe 
44fb2f18f8Sesaxe /*
45fb2f18f8Sesaxe  * CMT scheduler / dispatcher support
46fb2f18f8Sesaxe  *
47fb2f18f8Sesaxe  * This file implements CMT scheduler support using Processor Groups.
48fb2f18f8Sesaxe  * The CMT processor group class creates and maintains the CMT class
49fb2f18f8Sesaxe  * specific processor group pg_cmt_t.
50fb2f18f8Sesaxe  *
51fb2f18f8Sesaxe  * ---------------------------- <-- pg_cmt_t *
52fb2f18f8Sesaxe  * | pghw_t                   |
53fb2f18f8Sesaxe  * ----------------------------
54fb2f18f8Sesaxe  * | CMT class specific data  |
55fb2f18f8Sesaxe  * | - hierarchy linkage      |
56fb2f18f8Sesaxe  * | - CMT load balancing data|
57fb2f18f8Sesaxe  * | - active CPU group/bitset|
58fb2f18f8Sesaxe  * ----------------------------
59fb2f18f8Sesaxe  *
60fb2f18f8Sesaxe  * The scheduler/dispatcher leverages knowledge of the performance
61fb2f18f8Sesaxe  * relevant CMT sharing relationships existing between cpus to implement
620e751525SEric Saxe  * optimized affinity, load balancing, and coalescence policies.
63fb2f18f8Sesaxe  *
64fb2f18f8Sesaxe  * Load balancing policy seeks to improve performance by minimizing
650e751525SEric Saxe  * contention over shared processor resources / facilities. Affinity
660e751525SEric Saxe  * policies seek to improve cache and TLB utilization. Coalescence
670e751525SEric Saxe  * policies improve resource utilization and ultimately power efficiency.
68fb2f18f8Sesaxe  *
69fb2f18f8Sesaxe  * The CMT PGs created by this class are already arranged into a
70fb2f18f8Sesaxe  * hierarchy (which is done in the pghw layer). To implement the top-down
71fb2f18f8Sesaxe  * CMT load balancing algorithm, the CMT PGs additionally maintain
72fb2f18f8Sesaxe  * parent, child and sibling hierarchy relationships.
73fb2f18f8Sesaxe  * Parent PGs always contain a superset of their children(s) resources,
74fb2f18f8Sesaxe  * each PG can have at most one parent, and siblings are the group of PGs
75fb2f18f8Sesaxe  * sharing the same parent.
76fb2f18f8Sesaxe  *
77d0e93b69SEric Saxe  * On UMA based systems, the CMT load balancing algorithm begins by balancing
78d0e93b69SEric Saxe  * load across the group of top level PGs in the system hierarchy.
79d0e93b69SEric Saxe  * On NUMA systems, the CMT load balancing algorithm balances load across the
80d0e93b69SEric Saxe  * group of top level PGs in each leaf lgroup...but for root homed threads,
81d0e93b69SEric Saxe  * is willing to balance against all the top level PGs in the system.
82d0e93b69SEric Saxe  *
83d0e93b69SEric Saxe  * Groups of top level PGs are maintained to implement the above, one for each
84d0e93b69SEric Saxe  * leaf lgroup (containing the top level PGs in that lgroup), and one (for the
85d0e93b69SEric Saxe  * root lgroup) that contains all the top level PGs in the system.
86fb2f18f8Sesaxe  */
87a6604450Sesaxe static cmt_lgrp_t	*cmt_lgrps = NULL;	/* cmt_lgrps list head */
88a6604450Sesaxe static cmt_lgrp_t	*cpu0_lgrp = NULL;	/* boot CPU's initial lgrp */
89a6604450Sesaxe 						/* used for null_proc_lpa */
900e751525SEric Saxe cmt_lgrp_t		*cmt_root = NULL;	/* Reference to root cmt pg */
91fb2f18f8Sesaxe 
92a6604450Sesaxe static int		is_cpu0 = 1; /* true if this is boot CPU context */
93a6604450Sesaxe 
94a6604450Sesaxe /*
950e751525SEric Saxe  * Array of hardware sharing relationships that are blacklisted.
96d0e93b69SEric Saxe  * CMT scheduling optimizations won't be performed for blacklisted sharing
97d0e93b69SEric Saxe  * relationships.
980e751525SEric Saxe  */
990e751525SEric Saxe static int		cmt_hw_blacklisted[PGHW_NUM_COMPONENTS];
1000e751525SEric Saxe 
1010e751525SEric Saxe /*
102a6604450Sesaxe  * Set this to non-zero to disable CMT scheduling
103a6604450Sesaxe  * This must be done via kmdb -d, as /etc/system will be too late
104a6604450Sesaxe  */
1050e751525SEric Saxe int			cmt_sched_disabled = 0;
106fb2f18f8Sesaxe 
/*
 * Result codes produced by CMT lineage validation; see
 * pg_cmt_lineage_validate() below.
 *
 * pg_cmt_cpu_init() only proceeds when the result is CMT_LINEAGE_VALID or
 * CMT_LINEAGE_REPAIRED; any other value is treated there as unrecoverable.
 */
typedef enum cmt_lineage_validation {
	CMT_LINEAGE_VALID,
	CMT_LINEAGE_NON_CONCENTRIC,
	CMT_LINEAGE_PG_SPANS_LGRPS,
	CMT_LINEAGE_NON_PROMOTABLE,
	CMT_LINEAGE_REPAIRED,
	CMT_LINEAGE_UNRECOVERABLE
} cmt_lineage_validation_t;

/*
 * Status of the lineage currently under construction.
 * cpu_lock must be held to change this.
 */
cmt_lineage_validation_t	cmt_lineage_status = CMT_LINEAGE_VALID;
125ef4f35d8SEric Saxe 
126ef4f35d8SEric Saxe /*
127ef4f35d8SEric Saxe  * Power domain definitions (on x86) are defined by ACPI, and
128ef4f35d8SEric Saxe  * therefore may be subject to BIOS bugs.
129ef4f35d8SEric Saxe  */
130ef4f35d8SEric Saxe #define	PG_CMT_HW_SUSPECT(hw)	PGHW_IS_PM_DOMAIN(hw)
131ef4f35d8SEric Saxe 
132ef4f35d8SEric Saxe /*
133ef4f35d8SEric Saxe  * Macro to test if PG is managed by the CMT PG class
134ef4f35d8SEric Saxe  */
135ef4f35d8SEric Saxe #define	IS_CMT_PG(pg)	(((pg_t *)(pg))->pg_class->pgc_id == pg_cmt_class_id)
136ef4f35d8SEric Saxe 
137fb2f18f8Sesaxe static pg_cid_t		pg_cmt_class_id;		/* PG class id */
138fb2f18f8Sesaxe 
139fb2f18f8Sesaxe static pg_t		*pg_cmt_alloc();
140fb2f18f8Sesaxe static void		pg_cmt_free(pg_t *);
14147ab0c7cSEric Saxe static void		pg_cmt_cpu_init(cpu_t *, cpu_pg_t *);
14247ab0c7cSEric Saxe static void		pg_cmt_cpu_fini(cpu_t *, cpu_pg_t *);
143fb2f18f8Sesaxe static void		pg_cmt_cpu_active(cpu_t *);
144fb2f18f8Sesaxe static void		pg_cmt_cpu_inactive(cpu_t *);
145fb2f18f8Sesaxe static void		pg_cmt_cpupart_in(cpu_t *, cpupart_t *);
146fb2f18f8Sesaxe static void		pg_cmt_cpupart_move(cpu_t *, cpupart_t *, cpupart_t *);
1470e751525SEric Saxe static char		*pg_cmt_policy_name(pg_t *);
1480e751525SEric Saxe static void		pg_cmt_hier_sort(pg_cmt_t **, int);
1490e751525SEric Saxe static pg_cmt_t		*pg_cmt_hier_rank(pg_cmt_t *, pg_cmt_t *);
150fb2f18f8Sesaxe static int		pg_cmt_cpu_belongs(pg_t *, cpu_t *);
151fb2f18f8Sesaxe static int		pg_cmt_hw(pghw_type_t);
152fb2f18f8Sesaxe static cmt_lgrp_t	*pg_cmt_find_lgrp(lgrp_handle_t);
153a6604450Sesaxe static cmt_lgrp_t	*pg_cmt_lgrp_create(lgrp_handle_t);
1540e751525SEric Saxe static void		cmt_ev_thread_swtch(pg_t *, cpu_t *, hrtime_t,
1550e751525SEric Saxe 			    kthread_t *, kthread_t *);
1560e751525SEric Saxe static void		cmt_ev_thread_swtch_pwr(pg_t *, cpu_t *, hrtime_t,
1570e751525SEric Saxe 			    kthread_t *, kthread_t *);
1580e751525SEric Saxe static void		cmt_ev_thread_remain_pwr(pg_t *, cpu_t *, kthread_t *);
1591a77c24bSEric Saxe static cmt_lineage_validation_t	pg_cmt_lineage_validate(pg_cmt_t **, int *,
1601a77c24bSEric Saxe 			    cpu_pg_t *);
161fb2f18f8Sesaxe 
1620e751525SEric Saxe /*
163fb2f18f8Sesaxe  * CMT PG ops
164fb2f18f8Sesaxe  */
165fb2f18f8Sesaxe struct pg_ops pg_ops_cmt = {
166fb2f18f8Sesaxe 	pg_cmt_alloc,
167fb2f18f8Sesaxe 	pg_cmt_free,
168fb2f18f8Sesaxe 	pg_cmt_cpu_init,
169fb2f18f8Sesaxe 	pg_cmt_cpu_fini,
170fb2f18f8Sesaxe 	pg_cmt_cpu_active,
171fb2f18f8Sesaxe 	pg_cmt_cpu_inactive,
172fb2f18f8Sesaxe 	pg_cmt_cpupart_in,
173fb2f18f8Sesaxe 	NULL,			/* cpupart_out */
174fb2f18f8Sesaxe 	pg_cmt_cpupart_move,
175fb2f18f8Sesaxe 	pg_cmt_cpu_belongs,
1760e751525SEric Saxe 	pg_cmt_policy_name,
177fb2f18f8Sesaxe };
178fb2f18f8Sesaxe 
179fb2f18f8Sesaxe /*
180fb2f18f8Sesaxe  * Initialize the CMT PG class
181fb2f18f8Sesaxe  */
182fb2f18f8Sesaxe void
183fb2f18f8Sesaxe pg_cmt_class_init(void)
184fb2f18f8Sesaxe {
185fb2f18f8Sesaxe 	if (cmt_sched_disabled)
186fb2f18f8Sesaxe 		return;
187fb2f18f8Sesaxe 
188fb2f18f8Sesaxe 	pg_cmt_class_id = pg_class_register("cmt", &pg_ops_cmt, PGR_PHYSICAL);
189fb2f18f8Sesaxe }
190fb2f18f8Sesaxe 
191fb2f18f8Sesaxe /*
192fb2f18f8Sesaxe  * Called to indicate a new CPU has started up so
193fb2f18f8Sesaxe  * that either t0 or the slave startup thread can
194fb2f18f8Sesaxe  * be accounted for.
195fb2f18f8Sesaxe  */
196fb2f18f8Sesaxe void
197fb2f18f8Sesaxe pg_cmt_cpu_startup(cpu_t *cp)
198fb2f18f8Sesaxe {
1990e751525SEric Saxe 	pg_ev_thread_swtch(cp, gethrtime_unscaled(), cp->cpu_idle_thread,
2000e751525SEric Saxe 	    cp->cpu_thread);
201fb2f18f8Sesaxe }
202fb2f18f8Sesaxe 
203fb2f18f8Sesaxe /*
204fb2f18f8Sesaxe  * Return non-zero if thread can migrate between "from" and "to"
205fb2f18f8Sesaxe  * without a performance penalty
206fb2f18f8Sesaxe  */
207fb2f18f8Sesaxe int
208fb2f18f8Sesaxe pg_cmt_can_migrate(cpu_t *from, cpu_t *to)
209fb2f18f8Sesaxe {
210fb2f18f8Sesaxe 	if (from->cpu_physid->cpu_cacheid ==
211fb2f18f8Sesaxe 	    to->cpu_physid->cpu_cacheid)
212fb2f18f8Sesaxe 		return (1);
213fb2f18f8Sesaxe 	return (0);
214fb2f18f8Sesaxe }
215fb2f18f8Sesaxe 
216fb2f18f8Sesaxe /*
217fb2f18f8Sesaxe  * CMT class specific PG allocation
218fb2f18f8Sesaxe  */
219fb2f18f8Sesaxe static pg_t *
220fb2f18f8Sesaxe pg_cmt_alloc(void)
221fb2f18f8Sesaxe {
222fb2f18f8Sesaxe 	return (kmem_zalloc(sizeof (pg_cmt_t), KM_NOSLEEP));
223fb2f18f8Sesaxe }
224fb2f18f8Sesaxe 
225fb2f18f8Sesaxe /*
226fb2f18f8Sesaxe  * Class specific PG de-allocation
227fb2f18f8Sesaxe  */
228fb2f18f8Sesaxe static void
229fb2f18f8Sesaxe pg_cmt_free(pg_t *pg)
230fb2f18f8Sesaxe {
231fb2f18f8Sesaxe 	ASSERT(pg != NULL);
232fb2f18f8Sesaxe 	ASSERT(IS_CMT_PG(pg));
233fb2f18f8Sesaxe 
234fb2f18f8Sesaxe 	kmem_free((pg_cmt_t *)pg, sizeof (pg_cmt_t));
235fb2f18f8Sesaxe }
236fb2f18f8Sesaxe 
237fb2f18f8Sesaxe /*
2380e751525SEric Saxe  * Given a hardware sharing relationship, return which dispatcher
2390e751525SEric Saxe  * policies should be implemented to optimize performance and efficiency
240fb2f18f8Sesaxe  */
2410e751525SEric Saxe static pg_cmt_policy_t
2420e751525SEric Saxe pg_cmt_policy(pghw_type_t hw)
243fb2f18f8Sesaxe {
2440e751525SEric Saxe 	pg_cmt_policy_t p;
2450e751525SEric Saxe 
2460e751525SEric Saxe 	/*
2470e751525SEric Saxe 	 * Give the platform a chance to override the default
2480e751525SEric Saxe 	 */
2490e751525SEric Saxe 	if ((p = pg_plat_cmt_policy(hw)) != CMT_NO_POLICY)
2500e751525SEric Saxe 		return (p);
2510e751525SEric Saxe 
2520e751525SEric Saxe 	switch (hw) {
2530e751525SEric Saxe 	case PGHW_IPIPE:
2540e751525SEric Saxe 	case PGHW_FPU:
2558031591dSSrihari Venkatesan 	case PGHW_PROCNODE:
2560e751525SEric Saxe 	case PGHW_CHIP:
2570e751525SEric Saxe 		return (CMT_BALANCE);
2580e751525SEric Saxe 	case PGHW_CACHE:
2590e751525SEric Saxe 		return (CMT_AFFINITY);
2600e751525SEric Saxe 	case PGHW_POW_ACTIVE:
2610e751525SEric Saxe 	case PGHW_POW_IDLE:
2620e751525SEric Saxe 		return (CMT_BALANCE);
2630e751525SEric Saxe 	default:
2640e751525SEric Saxe 		return (CMT_NO_POLICY);
2650e751525SEric Saxe 	}
2660e751525SEric Saxe }
2670e751525SEric Saxe 
2680e751525SEric Saxe /*
2690e751525SEric Saxe  * Rank the importance of optimizing for the pg1 relationship vs.
2700e751525SEric Saxe  * the pg2 relationship.
2710e751525SEric Saxe  */
2720e751525SEric Saxe static pg_cmt_t *
2730e751525SEric Saxe pg_cmt_hier_rank(pg_cmt_t *pg1, pg_cmt_t *pg2)
2740e751525SEric Saxe {
2750e751525SEric Saxe 	pghw_type_t hw1 = ((pghw_t *)pg1)->pghw_hw;
2760e751525SEric Saxe 	pghw_type_t hw2 = ((pghw_t *)pg2)->pghw_hw;
2770e751525SEric Saxe 
2780e751525SEric Saxe 	/*
2790e751525SEric Saxe 	 * A power domain is only important if CPUPM is enabled.
2800e751525SEric Saxe 	 */
2810e751525SEric Saxe 	if (cpupm_get_policy() == CPUPM_POLICY_DISABLED) {
2820e751525SEric Saxe 		if (PGHW_IS_PM_DOMAIN(hw1) && !PGHW_IS_PM_DOMAIN(hw2))
2830e751525SEric Saxe 			return (pg2);
2840e751525SEric Saxe 		if (PGHW_IS_PM_DOMAIN(hw2) && !PGHW_IS_PM_DOMAIN(hw1))
2850e751525SEric Saxe 			return (pg1);
2860e751525SEric Saxe 	}
2870e751525SEric Saxe 
2880e751525SEric Saxe 	/*
2890e751525SEric Saxe 	 * Otherwise, ask the platform
2900e751525SEric Saxe 	 */
2910e751525SEric Saxe 	if (pg_plat_hw_rank(hw1, hw2) == hw1)
2920e751525SEric Saxe 		return (pg1);
2930e751525SEric Saxe 	else
2940e751525SEric Saxe 		return (pg2);
2950e751525SEric Saxe }
2960e751525SEric Saxe 
2970e751525SEric Saxe /*
2980e751525SEric Saxe  * Initialize CMT callbacks for the given PG
2990e751525SEric Saxe  */
3000e751525SEric Saxe static void
3010e751525SEric Saxe cmt_callback_init(pg_t *pg)
3020e751525SEric Saxe {
303d0e93b69SEric Saxe 	/*
304d0e93b69SEric Saxe 	 * Stick with the default callbacks if there isn't going to be
305d0e93b69SEric Saxe 	 * any CMT thread placement optimizations implemented.
306d0e93b69SEric Saxe 	 */
307d0e93b69SEric Saxe 	if (((pg_cmt_t *)pg)->cmt_policy == CMT_NO_POLICY)
308d0e93b69SEric Saxe 		return;
309d0e93b69SEric Saxe 
3100e751525SEric Saxe 	switch (((pghw_t *)pg)->pghw_hw) {
3110e751525SEric Saxe 	case PGHW_POW_ACTIVE:
3120e751525SEric Saxe 		pg->pg_cb.thread_swtch = cmt_ev_thread_swtch_pwr;
3130e751525SEric Saxe 		pg->pg_cb.thread_remain = cmt_ev_thread_remain_pwr;
3140e751525SEric Saxe 		break;
3150e751525SEric Saxe 	default:
3160e751525SEric Saxe 		pg->pg_cb.thread_swtch = cmt_ev_thread_swtch;
3170e751525SEric Saxe 
3180e751525SEric Saxe 	}
3190e751525SEric Saxe }
3200e751525SEric Saxe 
3210e751525SEric Saxe /*
3220e751525SEric Saxe  * Promote PG above it's current parent.
3231a77c24bSEric Saxe  * This is only legal if PG has an equal or greater number of CPUs than its
3241a77c24bSEric Saxe  * parent.
3251a77c24bSEric Saxe  *
3261a77c24bSEric Saxe  * This routine operates on the CPU specific processor group data (for the CPUs
3271a77c24bSEric Saxe  * in the PG being promoted), and may be invoked from a context where one CPU's
3281a77c24bSEric Saxe  * PG data is under construction. In this case the argument "pgdata", if not
3291a77c24bSEric Saxe  * NULL, is a reference to the CPU's under-construction PG data.
3300e751525SEric Saxe  */
3310e751525SEric Saxe static void
3321a77c24bSEric Saxe cmt_hier_promote(pg_cmt_t *pg, cpu_pg_t *pgdata)
3330e751525SEric Saxe {
3340e751525SEric Saxe 	pg_cmt_t	*parent;
3350e751525SEric Saxe 	group_t		*children;
3360e751525SEric Saxe 	cpu_t		*cpu;
3370e751525SEric Saxe 	group_iter_t	iter;
3380e751525SEric Saxe 	pg_cpu_itr_t	cpu_iter;
3390e751525SEric Saxe 	int		r;
3400e751525SEric Saxe 	int		err;
341b025faeeSEric Saxe 	int		nchildren;
3420e751525SEric Saxe 
3430e751525SEric Saxe 	ASSERT(MUTEX_HELD(&cpu_lock));
3440e751525SEric Saxe 
3450e751525SEric Saxe 	parent = pg->cmt_parent;
3460e751525SEric Saxe 	if (parent == NULL) {
3470e751525SEric Saxe 		/*
3480e751525SEric Saxe 		 * Nothing to do
3490e751525SEric Saxe 		 */
3500e751525SEric Saxe 		return;
3510e751525SEric Saxe 	}
3520e751525SEric Saxe 
3530e751525SEric Saxe 	ASSERT(PG_NUM_CPUS((pg_t *)pg) >= PG_NUM_CPUS((pg_t *)parent));
3540e751525SEric Saxe 
3550e751525SEric Saxe 	/*
3560e751525SEric Saxe 	 * We're changing around the hierarchy, which is actively traversed
3570e751525SEric Saxe 	 * by the dispatcher. Pause CPUS to ensure exclusivity.
3580e751525SEric Saxe 	 */
3590e751525SEric Saxe 	pause_cpus(NULL);
3600e751525SEric Saxe 
3610e751525SEric Saxe 	/*
3620e751525SEric Saxe 	 * If necessary, update the parent's sibling set, replacing parent
3630e751525SEric Saxe 	 * with PG.
3640e751525SEric Saxe 	 */
3650e751525SEric Saxe 	if (parent->cmt_siblings) {
3660e751525SEric Saxe 		if (group_remove(parent->cmt_siblings, parent, GRP_NORESIZE)
3670e751525SEric Saxe 		    != -1) {
3680e751525SEric Saxe 			r = group_add(parent->cmt_siblings, pg, GRP_NORESIZE);
3690e751525SEric Saxe 			ASSERT(r != -1);
3700e751525SEric Saxe 		}
3710e751525SEric Saxe 	}
3720e751525SEric Saxe 
3730e751525SEric Saxe 	/*
3740e751525SEric Saxe 	 * If the parent is at the top of the hierarchy, replace it's entry
3750e751525SEric Saxe 	 * in the root lgroup's group of top level PGs.
3760e751525SEric Saxe 	 */
3770e751525SEric Saxe 	if (parent->cmt_parent == NULL &&
3780e751525SEric Saxe 	    parent->cmt_siblings != &cmt_root->cl_pgs) {
3790e751525SEric Saxe 		if (group_remove(&cmt_root->cl_pgs, parent, GRP_NORESIZE)
3800e751525SEric Saxe 		    != -1) {
3810e751525SEric Saxe 			r = group_add(&cmt_root->cl_pgs, pg, GRP_NORESIZE);
3820e751525SEric Saxe 			ASSERT(r != -1);
3830e751525SEric Saxe 		}
3840e751525SEric Saxe 	}
3850e751525SEric Saxe 
3860e751525SEric Saxe 	/*
3870e751525SEric Saxe 	 * We assume (and therefore assert) that the PG being promoted is an
3880e751525SEric Saxe 	 * only child of it's parent. Update the parent's children set
3890e751525SEric Saxe 	 * replacing PG's entry with the parent (since the parent is becoming
390b025faeeSEric Saxe 	 * the child). Then have PG and the parent swap children sets and
391b025faeeSEric Saxe 	 * children counts.
3920e751525SEric Saxe 	 */
3930e751525SEric Saxe 	ASSERT(GROUP_SIZE(parent->cmt_children) <= 1);
3940e751525SEric Saxe 	if (group_remove(parent->cmt_children, pg, GRP_NORESIZE) != -1) {
3950e751525SEric Saxe 		r = group_add(parent->cmt_children, parent, GRP_NORESIZE);
3960e751525SEric Saxe 		ASSERT(r != -1);
3970e751525SEric Saxe 	}
3980e751525SEric Saxe 
3990e751525SEric Saxe 	children = pg->cmt_children;
4000e751525SEric Saxe 	pg->cmt_children = parent->cmt_children;
4010e751525SEric Saxe 	parent->cmt_children = children;
4020e751525SEric Saxe 
403b025faeeSEric Saxe 	nchildren = pg->cmt_nchildren;
404b025faeeSEric Saxe 	pg->cmt_nchildren = parent->cmt_nchildren;
405b025faeeSEric Saxe 	parent->cmt_nchildren = nchildren;
406b025faeeSEric Saxe 
4070e751525SEric Saxe 	/*
4080e751525SEric Saxe 	 * Update the sibling references for PG and it's parent
4090e751525SEric Saxe 	 */
4100e751525SEric Saxe 	pg->cmt_siblings = parent->cmt_siblings;
4110e751525SEric Saxe 	parent->cmt_siblings = pg->cmt_children;
4120e751525SEric Saxe 
4130e751525SEric Saxe 	/*
4140e751525SEric Saxe 	 * Update any cached lineages in the per CPU pg data.
4150e751525SEric Saxe 	 */
4160e751525SEric Saxe 	PG_CPU_ITR_INIT(pg, cpu_iter);
4170e751525SEric Saxe 	while ((cpu = pg_cpu_next(&cpu_iter)) != NULL) {
4180e751525SEric Saxe 		int		idx;
419b025faeeSEric Saxe 		int		sz;
4200e751525SEric Saxe 		pg_cmt_t	*cpu_pg;
4211a77c24bSEric Saxe 		cpu_pg_t	*pgd;	/* CPU's PG data */
4221a77c24bSEric Saxe 
4231a77c24bSEric Saxe 		/*
4241a77c24bSEric Saxe 		 * The CPU's whose lineage is under construction still
4251a77c24bSEric Saxe 		 * references the bootstrap CPU PG data structure.
4261a77c24bSEric Saxe 		 */
4271a77c24bSEric Saxe 		if (pg_cpu_is_bootstrapped(cpu))
4281a77c24bSEric Saxe 			pgd = pgdata;
4291a77c24bSEric Saxe 		else
4301a77c24bSEric Saxe 			pgd = cpu->cpu_pg;
4310e751525SEric Saxe 
4320e751525SEric Saxe 		/*
4330e751525SEric Saxe 		 * Iterate over the CPU's PGs updating the children
4340e751525SEric Saxe 		 * of the PG being promoted, since they have a new parent.
4350e751525SEric Saxe 		 */
4360e751525SEric Saxe 		group_iter_init(&iter);
4371a77c24bSEric Saxe 		while ((cpu_pg = group_iterate(&pgd->cmt_pgs, &iter)) != NULL) {
4380e751525SEric Saxe 			if (cpu_pg->cmt_parent == pg) {
4390e751525SEric Saxe 				cpu_pg->cmt_parent = parent;
4400e751525SEric Saxe 			}
4410e751525SEric Saxe 		}
4420e751525SEric Saxe 
4430e751525SEric Saxe 		/*
4440e751525SEric Saxe 		 * Update the CMT load balancing lineage
4450e751525SEric Saxe 		 */
4461a77c24bSEric Saxe 		if ((idx = group_find(&pgd->cmt_pgs, (void *)pg)) == -1) {
4470e751525SEric Saxe 			/*
4480e751525SEric Saxe 			 * Unless this is the CPU who's lineage is being
4490e751525SEric Saxe 			 * constructed, the PG being promoted should be
4500e751525SEric Saxe 			 * in the lineage.
4510e751525SEric Saxe 			 */
4521a77c24bSEric Saxe 			ASSERT(pg_cpu_is_bootstrapped(cpu));
4530e751525SEric Saxe 			continue;
4540e751525SEric Saxe 		}
4550e751525SEric Saxe 
4560e751525SEric Saxe 		ASSERT(idx > 0);
457b025faeeSEric Saxe 		ASSERT(GROUP_ACCESS(&pgd->cmt_pgs, idx - 1) == parent);
4580e751525SEric Saxe 
4590e751525SEric Saxe 		/*
4600e751525SEric Saxe 		 * Have the child and the parent swap places in the CPU's
4610e751525SEric Saxe 		 * lineage
4620e751525SEric Saxe 		 */
4631a77c24bSEric Saxe 		group_remove_at(&pgd->cmt_pgs, idx);
4641a77c24bSEric Saxe 		group_remove_at(&pgd->cmt_pgs, idx - 1);
4651a77c24bSEric Saxe 		err = group_add_at(&pgd->cmt_pgs, parent, idx);
4660e751525SEric Saxe 		ASSERT(err == 0);
4671a77c24bSEric Saxe 		err = group_add_at(&pgd->cmt_pgs, pg, idx - 1);
4680e751525SEric Saxe 		ASSERT(err == 0);
469b025faeeSEric Saxe 
470b025faeeSEric Saxe 		/*
471b025faeeSEric Saxe 		 * Ensure cmt_lineage references CPU's leaf PG.
472b025faeeSEric Saxe 		 * Since cmt_pgs is top-down ordered, the bottom is the last
473b025faeeSEric Saxe 		 * element.
474b025faeeSEric Saxe 		 */
475b025faeeSEric Saxe 		if ((sz = GROUP_SIZE(&pgd->cmt_pgs)) > 0)
476b025faeeSEric Saxe 			pgd->cmt_lineage = GROUP_ACCESS(&pgd->cmt_pgs, sz - 1);
4770e751525SEric Saxe 	}
4780e751525SEric Saxe 
4790e751525SEric Saxe 	/*
4800e751525SEric Saxe 	 * Update the parent references for PG and it's parent
4810e751525SEric Saxe 	 */
4820e751525SEric Saxe 	pg->cmt_parent = parent->cmt_parent;
4830e751525SEric Saxe 	parent->cmt_parent = pg;
4840e751525SEric Saxe 
4850e751525SEric Saxe 	start_cpus();
486fb2f18f8Sesaxe }
487fb2f18f8Sesaxe 
488fb2f18f8Sesaxe /*
489fb2f18f8Sesaxe  * CMT class callback for a new CPU entering the system
4901a77c24bSEric Saxe  *
4911a77c24bSEric Saxe  * This routine operates on the CPU specific processor group data (for the CPU
4921a77c24bSEric Saxe  * being initialized). The argument "pgdata" is a reference to the CPU's PG
4931a77c24bSEric Saxe  * data to be constructed.
4941a77c24bSEric Saxe  *
4951a77c24bSEric Saxe  * cp->cpu_pg, which the dispatcher uses to access the CPU's PG data,
4961a77c24bSEric Saxe  * references a "bootstrap" structure. pg_cmt_cpu_init() and the routines it
4971a77c24bSEric Saxe  * calls must be careful to operate only on the "pgdata" argument, and not
4981a77c24bSEric Saxe  * cp->cpu_pg.
499fb2f18f8Sesaxe  */
500fb2f18f8Sesaxe static void
5011a77c24bSEric Saxe pg_cmt_cpu_init(cpu_t *cp, cpu_pg_t *pgdata)
502fb2f18f8Sesaxe {
503fb2f18f8Sesaxe 	pg_cmt_t	*pg;
504fb2f18f8Sesaxe 	group_t		*cmt_pgs;
5050e751525SEric Saxe 	int		levels, level;
506fb2f18f8Sesaxe 	pghw_type_t	hw;
507fb2f18f8Sesaxe 	pg_t		*pg_cache = NULL;
508fb2f18f8Sesaxe 	pg_cmt_t	*cpu_cmt_hier[PGHW_NUM_COMPONENTS];
509fb2f18f8Sesaxe 	lgrp_handle_t	lgrp_handle;
510fb2f18f8Sesaxe 	cmt_lgrp_t	*lgrp;
511ef4f35d8SEric Saxe 	cmt_lineage_validation_t	lineage_status;
512fb2f18f8Sesaxe 
513fb2f18f8Sesaxe 	ASSERT(MUTEX_HELD(&cpu_lock));
5141a77c24bSEric Saxe 	ASSERT(pg_cpu_is_bootstrapped(cp));
515fb2f18f8Sesaxe 
5160e751525SEric Saxe 	if (cmt_sched_disabled)
5170e751525SEric Saxe 		return;
5180e751525SEric Saxe 
519fb2f18f8Sesaxe 	/*
520fb2f18f8Sesaxe 	 * A new CPU is coming into the system.
521fb2f18f8Sesaxe 	 * Interrogate the platform to see if the CPU
5220e751525SEric Saxe 	 * has any performance or efficiency relevant
5230e751525SEric Saxe 	 * sharing relationships
524fb2f18f8Sesaxe 	 */
5251a77c24bSEric Saxe 	cmt_pgs = &pgdata->cmt_pgs;
5261a77c24bSEric Saxe 	pgdata->cmt_lineage = NULL;
527fb2f18f8Sesaxe 
528fb2f18f8Sesaxe 	bzero(cpu_cmt_hier, sizeof (cpu_cmt_hier));
5290e751525SEric Saxe 	levels = 0;
530fb2f18f8Sesaxe 	for (hw = PGHW_START; hw < PGHW_NUM_COMPONENTS; hw++) {
531fb2f18f8Sesaxe 
5320e751525SEric Saxe 		pg_cmt_policy_t	policy;
5330e751525SEric Saxe 
534fb2f18f8Sesaxe 		/*
5350e751525SEric Saxe 		 * We're only interested in the hw sharing relationships
5360e751525SEric Saxe 		 * for which we know how to optimize.
537fb2f18f8Sesaxe 		 */
5380e751525SEric Saxe 		policy = pg_cmt_policy(hw);
5390e751525SEric Saxe 		if (policy == CMT_NO_POLICY ||
5400e751525SEric Saxe 		    pg_plat_hw_shared(cp, hw) == 0)
541fb2f18f8Sesaxe 			continue;
542fb2f18f8Sesaxe 
543fb2f18f8Sesaxe 		/*
544d0e93b69SEric Saxe 		 * We will still create the PGs for hardware sharing
545d0e93b69SEric Saxe 		 * relationships that have been blacklisted, but won't
546d0e93b69SEric Saxe 		 * implement CMT thread placement optimizations against them.
5470e751525SEric Saxe 		 */
548d0e93b69SEric Saxe 		if (cmt_hw_blacklisted[hw] == 1)
549d0e93b69SEric Saxe 			policy = CMT_NO_POLICY;
5500e751525SEric Saxe 
5510e751525SEric Saxe 		/*
552fb2f18f8Sesaxe 		 * Find (or create) the PG associated with
553fb2f18f8Sesaxe 		 * the hw sharing relationship in which cp
554fb2f18f8Sesaxe 		 * belongs.
555fb2f18f8Sesaxe 		 *
556fb2f18f8Sesaxe 		 * Determine if a suitable PG already
557fb2f18f8Sesaxe 		 * exists, or if one needs to be created.
558fb2f18f8Sesaxe 		 */
559fb2f18f8Sesaxe 		pg = (pg_cmt_t *)pghw_place_cpu(cp, hw);
560fb2f18f8Sesaxe 		if (pg == NULL) {
561fb2f18f8Sesaxe 			/*
562fb2f18f8Sesaxe 			 * Create a new one.
563fb2f18f8Sesaxe 			 * Initialize the common...
564fb2f18f8Sesaxe 			 */
565fb2f18f8Sesaxe 			pg = (pg_cmt_t *)pg_create(pg_cmt_class_id);
566fb2f18f8Sesaxe 
567fb2f18f8Sesaxe 			/* ... physical ... */
568fb2f18f8Sesaxe 			pghw_init((pghw_t *)pg, cp, hw);
569fb2f18f8Sesaxe 
570fb2f18f8Sesaxe 			/*
571fb2f18f8Sesaxe 			 * ... and CMT specific portions of the
572fb2f18f8Sesaxe 			 * structure.
573fb2f18f8Sesaxe 			 */
5740e751525SEric Saxe 			pg->cmt_policy = policy;
5750e751525SEric Saxe 
5760e751525SEric Saxe 			/* CMT event callbacks */
5770e751525SEric Saxe 			cmt_callback_init((pg_t *)pg);
5780e751525SEric Saxe 
579fb2f18f8Sesaxe 			bitset_init(&pg->cmt_cpus_actv_set);
580fb2f18f8Sesaxe 			group_create(&pg->cmt_cpus_actv);
581fb2f18f8Sesaxe 		} else {
582fb2f18f8Sesaxe 			ASSERT(IS_CMT_PG(pg));
583fb2f18f8Sesaxe 		}
584fb2f18f8Sesaxe 
585*b885580bSAlexander Kolbasov 		((pghw_t *)pg)->pghw_generation++;
586*b885580bSAlexander Kolbasov 
587fb2f18f8Sesaxe 		/* Add the CPU to the PG */
5881a77c24bSEric Saxe 		pg_cpu_add((pg_t *)pg, cp, pgdata);
589fb2f18f8Sesaxe 
590fb2f18f8Sesaxe 		/*
5916890d023SEric Saxe 		 * Ensure capacity of the active CPU group/bitset
592fb2f18f8Sesaxe 		 */
593fb2f18f8Sesaxe 		group_expand(&pg->cmt_cpus_actv,
594fb2f18f8Sesaxe 		    GROUP_SIZE(&((pg_t *)pg)->pg_cpus));
595fb2f18f8Sesaxe 
596fb2f18f8Sesaxe 		if (cp->cpu_seqid >=
597fb2f18f8Sesaxe 		    bitset_capacity(&pg->cmt_cpus_actv_set)) {
598fb2f18f8Sesaxe 			bitset_resize(&pg->cmt_cpus_actv_set,
599fb2f18f8Sesaxe 			    cp->cpu_seqid + 1);
600fb2f18f8Sesaxe 		}
601fb2f18f8Sesaxe 
602fb2f18f8Sesaxe 		/*
6030e751525SEric Saxe 		 * Build a lineage of CMT PGs for load balancing / coalescence
604fb2f18f8Sesaxe 		 */
6050e751525SEric Saxe 		if (policy & (CMT_BALANCE | CMT_COALESCE)) {
6060e751525SEric Saxe 			cpu_cmt_hier[levels++] = pg;
607fb2f18f8Sesaxe 		}
608fb2f18f8Sesaxe 
609fb2f18f8Sesaxe 		/* Cache this for later */
610fb2f18f8Sesaxe 		if (hw == PGHW_CACHE)
611fb2f18f8Sesaxe 			pg_cache = (pg_t *)pg;
612fb2f18f8Sesaxe 	}
613fb2f18f8Sesaxe 
6140e751525SEric Saxe 	group_expand(cmt_pgs, levels);
6156890d023SEric Saxe 
6166890d023SEric Saxe 	if (cmt_root == NULL)
6176890d023SEric Saxe 		cmt_root = pg_cmt_lgrp_create(lgrp_plat_root_hand());
618fb2f18f8Sesaxe 
619fb2f18f8Sesaxe 	/*
6200e751525SEric Saxe 	 * Find the lgrp that encapsulates this CPU's CMT hierarchy
6216890d023SEric Saxe 	 */
6226890d023SEric Saxe 	lgrp_handle = lgrp_plat_cpu_to_hand(cp->cpu_id);
6236890d023SEric Saxe 	if ((lgrp = pg_cmt_find_lgrp(lgrp_handle)) == NULL)
6246890d023SEric Saxe 		lgrp = pg_cmt_lgrp_create(lgrp_handle);
6256890d023SEric Saxe 
6266890d023SEric Saxe 	/*
6270e751525SEric Saxe 	 * Ascendingly sort the PGs in the lineage by number of CPUs
6280e751525SEric Saxe 	 */
6290e751525SEric Saxe 	pg_cmt_hier_sort(cpu_cmt_hier, levels);
6300e751525SEric Saxe 
6310e751525SEric Saxe 	/*
6320e751525SEric Saxe 	 * Examine the lineage and validate it.
6330e751525SEric Saxe 	 * This routine will also try to fix the lineage along with the
6340e751525SEric Saxe 	 * rest of the PG hierarchy should it detect an issue.
6350e751525SEric Saxe 	 *
636ef4f35d8SEric Saxe 	 * If it returns anything other than VALID or REPAIRED, an
637ef4f35d8SEric Saxe 	 * unrecoverable error has occurred, and we cannot proceed.
6380e751525SEric Saxe 	 */
6391a77c24bSEric Saxe 	lineage_status = pg_cmt_lineage_validate(cpu_cmt_hier, &levels, pgdata);
640ef4f35d8SEric Saxe 	if ((lineage_status != CMT_LINEAGE_VALID) &&
6411a77c24bSEric Saxe 	    (lineage_status != CMT_LINEAGE_REPAIRED)) {
6421a77c24bSEric Saxe 		/*
6431a77c24bSEric Saxe 		 * In the case of an unrecoverable error where CMT scheduling
6441a77c24bSEric Saxe 		 * has been disabled, assert that the under construction CPU's
6451a77c24bSEric Saxe 		 * PG data has an empty CMT load balancing lineage.
6461a77c24bSEric Saxe 		 */
6471a77c24bSEric Saxe 		ASSERT((cmt_sched_disabled == 0) ||
6481a77c24bSEric Saxe 		    (GROUP_SIZE(&(pgdata->cmt_pgs)) == 0));
6490e751525SEric Saxe 		return;
6501a77c24bSEric Saxe 	}
6510e751525SEric Saxe 
6520e751525SEric Saxe 	/*
6530e751525SEric Saxe 	 * For existing PGs in the lineage, verify that the parent is
6540e751525SEric Saxe 	 * correct, as the generation in the lineage may have changed
6550e751525SEric Saxe 	 * as a result of the sorting. Start the traversal at the top
6560e751525SEric Saxe 	 * of the lineage, moving down.
6570e751525SEric Saxe 	 */
6580e751525SEric Saxe 	for (level = levels - 1; level >= 0; ) {
6590e751525SEric Saxe 		int reorg;
6600e751525SEric Saxe 
6610e751525SEric Saxe 		reorg = 0;
6620e751525SEric Saxe 		pg = cpu_cmt_hier[level];
6630e751525SEric Saxe 
6640e751525SEric Saxe 		/*
6650e751525SEric Saxe 		 * Promote PGs at an incorrect generation into place.
6660e751525SEric Saxe 		 */
6670e751525SEric Saxe 		while (pg->cmt_parent &&
6680e751525SEric Saxe 		    pg->cmt_parent != cpu_cmt_hier[level + 1]) {
6691a77c24bSEric Saxe 			cmt_hier_promote(pg, pgdata);
6700e751525SEric Saxe 			reorg++;
6710e751525SEric Saxe 		}
6720e751525SEric Saxe 		if (reorg > 0)
6730e751525SEric Saxe 			level = levels - 1;
6740e751525SEric Saxe 		else
6750e751525SEric Saxe 			level--;
6760e751525SEric Saxe 	}
6770e751525SEric Saxe 
6780e751525SEric Saxe 	/*
6796890d023SEric Saxe 	 * For each of the PGs in the CPU's lineage:
6800e751525SEric Saxe 	 *	- Add an entry in the CPU sorted CMT PG group
6810e751525SEric Saxe 	 *	  which is used for top down CMT load balancing
682fb2f18f8Sesaxe 	 *	- Tie the PG into the CMT hierarchy by connecting
683fb2f18f8Sesaxe 	 *	  it to it's parent and siblings.
684fb2f18f8Sesaxe 	 */
6850e751525SEric Saxe 	for (level = 0; level < levels; level++) {
686fb2f18f8Sesaxe 		uint_t		children;
687fb2f18f8Sesaxe 		int		err;
688fb2f18f8Sesaxe 
689fb2f18f8Sesaxe 		pg = cpu_cmt_hier[level];
6900e751525SEric Saxe 		err = group_add_at(cmt_pgs, pg, levels - level - 1);
691fb2f18f8Sesaxe 		ASSERT(err == 0);
692fb2f18f8Sesaxe 
693fb2f18f8Sesaxe 		if (level == 0)
6941a77c24bSEric Saxe 			pgdata->cmt_lineage = (pg_t *)pg;
695fb2f18f8Sesaxe 
696fb2f18f8Sesaxe 		if (pg->cmt_siblings != NULL) {
697fb2f18f8Sesaxe 			/* Already initialized */
698fb2f18f8Sesaxe 			ASSERT(pg->cmt_parent == NULL ||
699fb2f18f8Sesaxe 			    pg->cmt_parent == cpu_cmt_hier[level + 1]);
700fb2f18f8Sesaxe 			ASSERT(pg->cmt_siblings == &lgrp->cl_pgs ||
701c416da2dSjb145095 			    ((pg->cmt_parent != NULL) &&
702c416da2dSjb145095 			    pg->cmt_siblings == pg->cmt_parent->cmt_children));
703fb2f18f8Sesaxe 			continue;
704fb2f18f8Sesaxe 		}
705fb2f18f8Sesaxe 
7060e751525SEric Saxe 		if ((level + 1) == levels) {
707fb2f18f8Sesaxe 			pg->cmt_parent = NULL;
7086890d023SEric Saxe 
709fb2f18f8Sesaxe 			pg->cmt_siblings = &lgrp->cl_pgs;
710fb2f18f8Sesaxe 			children = ++lgrp->cl_npgs;
7110e751525SEric Saxe 			if (cmt_root != lgrp)
7126890d023SEric Saxe 				cmt_root->cl_npgs++;
713fb2f18f8Sesaxe 		} else {
714fb2f18f8Sesaxe 			pg->cmt_parent = cpu_cmt_hier[level + 1];
715fb2f18f8Sesaxe 
716fb2f18f8Sesaxe 			/*
717fb2f18f8Sesaxe 			 * A good parent keeps track of their children.
718fb2f18f8Sesaxe 			 * The parent's children group is also the PG's
719fb2f18f8Sesaxe 			 * siblings.
720fb2f18f8Sesaxe 			 */
721fb2f18f8Sesaxe 			if (pg->cmt_parent->cmt_children == NULL) {
722fb2f18f8Sesaxe 				pg->cmt_parent->cmt_children =
723fb2f18f8Sesaxe 				    kmem_zalloc(sizeof (group_t), KM_SLEEP);
724fb2f18f8Sesaxe 				group_create(pg->cmt_parent->cmt_children);
725fb2f18f8Sesaxe 			}
726fb2f18f8Sesaxe 			pg->cmt_siblings = pg->cmt_parent->cmt_children;
727fb2f18f8Sesaxe 			children = ++pg->cmt_parent->cmt_nchildren;
728fb2f18f8Sesaxe 		}
7296890d023SEric Saxe 
730fb2f18f8Sesaxe 		group_expand(pg->cmt_siblings, children);
7316890d023SEric Saxe 		group_expand(&cmt_root->cl_pgs, cmt_root->cl_npgs);
732fb2f18f8Sesaxe 	}
733fb2f18f8Sesaxe 
734fb2f18f8Sesaxe 	/*
735fb2f18f8Sesaxe 	 * Cache the chip and core IDs in the cpu_t->cpu_physid structure
736fb2f18f8Sesaxe 	 * for fast lookups later.
737fb2f18f8Sesaxe 	 */
738fb2f18f8Sesaxe 	if (cp->cpu_physid) {
739fb2f18f8Sesaxe 		cp->cpu_physid->cpu_chipid =
740fb2f18f8Sesaxe 		    pg_plat_hw_instance_id(cp, PGHW_CHIP);
741fb2f18f8Sesaxe 		cp->cpu_physid->cpu_coreid = pg_plat_get_core_id(cp);
742fb2f18f8Sesaxe 
743fb2f18f8Sesaxe 		/*
744fb2f18f8Sesaxe 		 * If this cpu has a PG representing shared cache, then set
745fb2f18f8Sesaxe 		 * cpu_cacheid to that PG's logical id
746fb2f18f8Sesaxe 		 */
747fb2f18f8Sesaxe 		if (pg_cache)
748fb2f18f8Sesaxe 			cp->cpu_physid->cpu_cacheid = pg_cache->pg_id;
749fb2f18f8Sesaxe 	}
750fb2f18f8Sesaxe 
751fb2f18f8Sesaxe 	/* CPU0 only initialization */
752fb2f18f8Sesaxe 	if (is_cpu0) {
753fb2f18f8Sesaxe 		is_cpu0 = 0;
754a6604450Sesaxe 		cpu0_lgrp = lgrp;
755fb2f18f8Sesaxe 	}
756fb2f18f8Sesaxe 
757fb2f18f8Sesaxe }
758fb2f18f8Sesaxe 
759fb2f18f8Sesaxe /*
760fb2f18f8Sesaxe  * Class callback when a CPU is leaving the system (deletion)
7611a77c24bSEric Saxe  *
7621a77c24bSEric Saxe  * "pgdata" is a reference to the CPU's PG data to be deconstructed.
7631a77c24bSEric Saxe  *
7641a77c24bSEric Saxe  * cp->cpu_pg is used by the dispatcher to access the CPU's PG data
7651a77c24bSEric Saxe  * references a "bootstrap" structure across this function's invocation.
766*b885580bSAlexander Kolbasov  * pg_cmt_cpu_fini() and the routines it calls must be careful to operate only
7671a77c24bSEric Saxe  * on the "pgdata" argument, and not cp->cpu_pg.
768fb2f18f8Sesaxe  */
769fb2f18f8Sesaxe static void
7701a77c24bSEric Saxe pg_cmt_cpu_fini(cpu_t *cp, cpu_pg_t *pgdata)
771fb2f18f8Sesaxe {
772fb2f18f8Sesaxe 	group_iter_t	i;
773fb2f18f8Sesaxe 	pg_cmt_t	*pg;
774fb2f18f8Sesaxe 	group_t		*pgs, *cmt_pgs;
775fb2f18f8Sesaxe 	lgrp_handle_t	lgrp_handle;
776fb2f18f8Sesaxe 	cmt_lgrp_t	*lgrp;
777fb2f18f8Sesaxe 
7780e751525SEric Saxe 	if (cmt_sched_disabled)
7790e751525SEric Saxe 		return;
7800e751525SEric Saxe 
7811a77c24bSEric Saxe 	ASSERT(pg_cpu_is_bootstrapped(cp));
7821a77c24bSEric Saxe 
7831a77c24bSEric Saxe 	pgs = &pgdata->pgs;
7841a77c24bSEric Saxe 	cmt_pgs = &pgdata->cmt_pgs;
785fb2f18f8Sesaxe 
786fb2f18f8Sesaxe 	/*
787fb2f18f8Sesaxe 	 * Find the lgroup that encapsulates this CPU's CMT hierarchy
788fb2f18f8Sesaxe 	 */
789fb2f18f8Sesaxe 	lgrp_handle = lgrp_plat_cpu_to_hand(cp->cpu_id);
790a6604450Sesaxe 
791fb2f18f8Sesaxe 	lgrp = pg_cmt_find_lgrp(lgrp_handle);
7923e81cacfSEric Saxe 	if (ncpus == 1 && lgrp != cpu0_lgrp) {
793a6604450Sesaxe 		/*
7943e81cacfSEric Saxe 		 * One might wonder how we could be deconfiguring the
7953e81cacfSEric Saxe 		 * only CPU in the system.
796a6604450Sesaxe 		 *
7973e81cacfSEric Saxe 		 * On Starcat systems when null_proc_lpa is detected,
7983e81cacfSEric Saxe 		 * the boot CPU (which is already configured into a leaf
7993e81cacfSEric Saxe 		 * lgroup), is moved into the root lgroup. This is done by
8003e81cacfSEric Saxe 		 * deconfiguring it from both lgroups and processor
8013e81cacfSEric Saxe 		 * groups), and then later reconfiguring it back in.  This
8023e81cacfSEric Saxe 		 * call to pg_cmt_cpu_fini() is part of that deconfiguration.
8033e81cacfSEric Saxe 		 *
8043e81cacfSEric Saxe 		 * This special case is detected by noting that the platform
8053e81cacfSEric Saxe 		 * has changed the CPU's lgrp affiliation (since it now
8063e81cacfSEric Saxe 		 * belongs in the root). In this case, use the cmt_lgrp_t
8073e81cacfSEric Saxe 		 * cached for the boot CPU, since this is what needs to be
8083e81cacfSEric Saxe 		 * torn down.
809a6604450Sesaxe 		 */
810a6604450Sesaxe 		lgrp = cpu0_lgrp;
811a6604450Sesaxe 	}
812fb2f18f8Sesaxe 
8133e81cacfSEric Saxe 	ASSERT(lgrp != NULL);
8143e81cacfSEric Saxe 
815fb2f18f8Sesaxe 	/*
816fb2f18f8Sesaxe 	 * First, clean up anything load balancing specific for each of
817fb2f18f8Sesaxe 	 * the CPU's PGs that participated in CMT load balancing
818fb2f18f8Sesaxe 	 */
8191a77c24bSEric Saxe 	pg = (pg_cmt_t *)pgdata->cmt_lineage;
820fb2f18f8Sesaxe 	while (pg != NULL) {
821fb2f18f8Sesaxe 
822*b885580bSAlexander Kolbasov 		((pghw_t *)pg)->pghw_generation++;
823*b885580bSAlexander Kolbasov 
824fb2f18f8Sesaxe 		/*
825fb2f18f8Sesaxe 		 * Remove the PG from the CPU's load balancing lineage
826fb2f18f8Sesaxe 		 */
827fb2f18f8Sesaxe 		(void) group_remove(cmt_pgs, pg, GRP_RESIZE);
828fb2f18f8Sesaxe 
829fb2f18f8Sesaxe 		/*
830fb2f18f8Sesaxe 		 * If it's about to become empty, destroy it's children
831fb2f18f8Sesaxe 		 * group, and remove it's reference from it's siblings.
832fb2f18f8Sesaxe 		 * This is done here (rather than below) to avoid removing
833fb2f18f8Sesaxe 		 * our reference from a PG that we just eliminated.
834fb2f18f8Sesaxe 		 */
835fb2f18f8Sesaxe 		if (GROUP_SIZE(&((pg_t *)pg)->pg_cpus) == 1) {
836fb2f18f8Sesaxe 			if (pg->cmt_children != NULL)
837fb2f18f8Sesaxe 				group_destroy(pg->cmt_children);
838fb2f18f8Sesaxe 			if (pg->cmt_siblings != NULL) {
839fb2f18f8Sesaxe 				if (pg->cmt_siblings == &lgrp->cl_pgs)
840fb2f18f8Sesaxe 					lgrp->cl_npgs--;
841fb2f18f8Sesaxe 				else
842fb2f18f8Sesaxe 					pg->cmt_parent->cmt_nchildren--;
843fb2f18f8Sesaxe 			}
844fb2f18f8Sesaxe 		}
845fb2f18f8Sesaxe 		pg = pg->cmt_parent;
846fb2f18f8Sesaxe 	}
847fb2f18f8Sesaxe 	ASSERT(GROUP_SIZE(cmt_pgs) == 0);
848fb2f18f8Sesaxe 
849fb2f18f8Sesaxe 	/*
850fb2f18f8Sesaxe 	 * Now that the load balancing lineage updates have happened,
851fb2f18f8Sesaxe 	 * remove the CPU from all it's PGs (destroying any that become
852fb2f18f8Sesaxe 	 * empty).
853fb2f18f8Sesaxe 	 */
854fb2f18f8Sesaxe 	group_iter_init(&i);
855fb2f18f8Sesaxe 	while ((pg = group_iterate(pgs, &i)) != NULL) {
856fb2f18f8Sesaxe 		if (IS_CMT_PG(pg) == 0)
857fb2f18f8Sesaxe 			continue;
858fb2f18f8Sesaxe 
8591a77c24bSEric Saxe 		pg_cpu_delete((pg_t *)pg, cp, pgdata);
860fb2f18f8Sesaxe 		/*
861fb2f18f8Sesaxe 		 * Deleting the CPU from the PG changes the CPU's
862fb2f18f8Sesaxe 		 * PG group over which we are actively iterating
863fb2f18f8Sesaxe 		 * Re-initialize the iteration
864fb2f18f8Sesaxe 		 */
865fb2f18f8Sesaxe 		group_iter_init(&i);
866fb2f18f8Sesaxe 
867fb2f18f8Sesaxe 		if (GROUP_SIZE(&((pg_t *)pg)->pg_cpus) == 0) {
868fb2f18f8Sesaxe 
869fb2f18f8Sesaxe 			/*
870fb2f18f8Sesaxe 			 * The PG has become zero sized, so destroy it.
871fb2f18f8Sesaxe 			 */
872fb2f18f8Sesaxe 			group_destroy(&pg->cmt_cpus_actv);
873fb2f18f8Sesaxe 			bitset_fini(&pg->cmt_cpus_actv_set);
874fb2f18f8Sesaxe 			pghw_fini((pghw_t *)pg);
875fb2f18f8Sesaxe 
876fb2f18f8Sesaxe 			pg_destroy((pg_t *)pg);
877fb2f18f8Sesaxe 		}
878fb2f18f8Sesaxe 	}
879fb2f18f8Sesaxe }
880fb2f18f8Sesaxe 
881fb2f18f8Sesaxe /*
882fb2f18f8Sesaxe  * Class callback when a CPU is entering a cpu partition
883fb2f18f8Sesaxe  */
884fb2f18f8Sesaxe static void
885fb2f18f8Sesaxe pg_cmt_cpupart_in(cpu_t *cp, cpupart_t *pp)
886fb2f18f8Sesaxe {
887fb2f18f8Sesaxe 	group_t		*pgs;
888fb2f18f8Sesaxe 	pg_t		*pg;
889fb2f18f8Sesaxe 	group_iter_t	i;
890fb2f18f8Sesaxe 
891fb2f18f8Sesaxe 	ASSERT(MUTEX_HELD(&cpu_lock));
892fb2f18f8Sesaxe 
8930e751525SEric Saxe 	if (cmt_sched_disabled)
8940e751525SEric Saxe 		return;
8950e751525SEric Saxe 
896fb2f18f8Sesaxe 	pgs = &cp->cpu_pg->pgs;
897fb2f18f8Sesaxe 
898fb2f18f8Sesaxe 	/*
899fb2f18f8Sesaxe 	 * Ensure that the new partition's PG bitset
900fb2f18f8Sesaxe 	 * is large enough for all CMT PG's to which cp
901fb2f18f8Sesaxe 	 * belongs
902fb2f18f8Sesaxe 	 */
903fb2f18f8Sesaxe 	group_iter_init(&i);
904fb2f18f8Sesaxe 	while ((pg = group_iterate(pgs, &i)) != NULL) {
905fb2f18f8Sesaxe 		if (IS_CMT_PG(pg) == 0)
906fb2f18f8Sesaxe 			continue;
907fb2f18f8Sesaxe 
908fb2f18f8Sesaxe 		if (bitset_capacity(&pp->cp_cmt_pgs) <= pg->pg_id)
909fb2f18f8Sesaxe 			bitset_resize(&pp->cp_cmt_pgs, pg->pg_id + 1);
910fb2f18f8Sesaxe 	}
911fb2f18f8Sesaxe }
912fb2f18f8Sesaxe 
913fb2f18f8Sesaxe /*
914fb2f18f8Sesaxe  * Class callback when a CPU is actually moving partitions
915fb2f18f8Sesaxe  */
916fb2f18f8Sesaxe static void
917fb2f18f8Sesaxe pg_cmt_cpupart_move(cpu_t *cp, cpupart_t *oldpp, cpupart_t *newpp)
918fb2f18f8Sesaxe {
919fb2f18f8Sesaxe 	cpu_t		*cpp;
920fb2f18f8Sesaxe 	group_t		*pgs;
921fb2f18f8Sesaxe 	pg_t		*pg;
922fb2f18f8Sesaxe 	group_iter_t	pg_iter;
923fb2f18f8Sesaxe 	pg_cpu_itr_t	cpu_iter;
924fb2f18f8Sesaxe 	boolean_t	found;
925fb2f18f8Sesaxe 
926fb2f18f8Sesaxe 	ASSERT(MUTEX_HELD(&cpu_lock));
927fb2f18f8Sesaxe 
9280e751525SEric Saxe 	if (cmt_sched_disabled)
9290e751525SEric Saxe 		return;
9300e751525SEric Saxe 
931fb2f18f8Sesaxe 	pgs = &cp->cpu_pg->pgs;
932fb2f18f8Sesaxe 	group_iter_init(&pg_iter);
933fb2f18f8Sesaxe 
934fb2f18f8Sesaxe 	/*
935fb2f18f8Sesaxe 	 * Iterate over the CPUs CMT PGs
936fb2f18f8Sesaxe 	 */
937fb2f18f8Sesaxe 	while ((pg = group_iterate(pgs, &pg_iter)) != NULL) {
938fb2f18f8Sesaxe 
939fb2f18f8Sesaxe 		if (IS_CMT_PG(pg) == 0)
940fb2f18f8Sesaxe 			continue;
941fb2f18f8Sesaxe 
942fb2f18f8Sesaxe 		/*
943fb2f18f8Sesaxe 		 * Add the PG to the bitset in the new partition.
944fb2f18f8Sesaxe 		 */
945fb2f18f8Sesaxe 		bitset_add(&newpp->cp_cmt_pgs, pg->pg_id);
946fb2f18f8Sesaxe 
947fb2f18f8Sesaxe 		/*
948fb2f18f8Sesaxe 		 * Remove the PG from the bitset in the old partition
949fb2f18f8Sesaxe 		 * if the last of the PG's CPUs have left.
950fb2f18f8Sesaxe 		 */
951fb2f18f8Sesaxe 		found = B_FALSE;
952fb2f18f8Sesaxe 		PG_CPU_ITR_INIT(pg, cpu_iter);
953fb2f18f8Sesaxe 		while ((cpp = pg_cpu_next(&cpu_iter)) != NULL) {
954fb2f18f8Sesaxe 			if (cpp == cp)
955fb2f18f8Sesaxe 				continue;
956a6604450Sesaxe 			if (CPU_ACTIVE(cpp) &&
957a6604450Sesaxe 			    cpp->cpu_part->cp_id == oldpp->cp_id) {
958fb2f18f8Sesaxe 				found = B_TRUE;
959fb2f18f8Sesaxe 				break;
960fb2f18f8Sesaxe 			}
961fb2f18f8Sesaxe 		}
962fb2f18f8Sesaxe 		if (!found)
963fb2f18f8Sesaxe 			bitset_del(&cp->cpu_part->cp_cmt_pgs, pg->pg_id);
964fb2f18f8Sesaxe 	}
965fb2f18f8Sesaxe }
966fb2f18f8Sesaxe 
967fb2f18f8Sesaxe /*
968fb2f18f8Sesaxe  * Class callback when a CPU becomes active (online)
969fb2f18f8Sesaxe  *
970fb2f18f8Sesaxe  * This is called in a context where CPUs are paused
971fb2f18f8Sesaxe  */
972fb2f18f8Sesaxe static void
973fb2f18f8Sesaxe pg_cmt_cpu_active(cpu_t *cp)
974fb2f18f8Sesaxe {
975fb2f18f8Sesaxe 	int		err;
976fb2f18f8Sesaxe 	group_iter_t	i;
977fb2f18f8Sesaxe 	pg_cmt_t	*pg;
978fb2f18f8Sesaxe 	group_t		*pgs;
979fb2f18f8Sesaxe 
980fb2f18f8Sesaxe 	ASSERT(MUTEX_HELD(&cpu_lock));
981fb2f18f8Sesaxe 
9820e751525SEric Saxe 	if (cmt_sched_disabled)
9830e751525SEric Saxe 		return;
9840e751525SEric Saxe 
985fb2f18f8Sesaxe 	pgs = &cp->cpu_pg->pgs;
986fb2f18f8Sesaxe 	group_iter_init(&i);
987fb2f18f8Sesaxe 
988fb2f18f8Sesaxe 	/*
989fb2f18f8Sesaxe 	 * Iterate over the CPU's PGs
990fb2f18f8Sesaxe 	 */
991fb2f18f8Sesaxe 	while ((pg = group_iterate(pgs, &i)) != NULL) {
992fb2f18f8Sesaxe 
993fb2f18f8Sesaxe 		if (IS_CMT_PG(pg) == 0)
994fb2f18f8Sesaxe 			continue;
995fb2f18f8Sesaxe 
996*b885580bSAlexander Kolbasov 		/*
997*b885580bSAlexander Kolbasov 		 * Move to the next generation since topology is changing
998*b885580bSAlexander Kolbasov 		 */
999*b885580bSAlexander Kolbasov 		((pghw_t *)pg)->pghw_generation++;
1000*b885580bSAlexander Kolbasov 
1001fb2f18f8Sesaxe 		err = group_add(&pg->cmt_cpus_actv, cp, GRP_NORESIZE);
1002fb2f18f8Sesaxe 		ASSERT(err == 0);
1003fb2f18f8Sesaxe 
1004fb2f18f8Sesaxe 		/*
1005fb2f18f8Sesaxe 		 * If this is the first active CPU in the PG, and it
1006fb2f18f8Sesaxe 		 * represents a hardware sharing relationship over which
1007fb2f18f8Sesaxe 		 * CMT load balancing is performed, add it as a candidate
1008fb2f18f8Sesaxe 		 * for balancing with it's siblings.
1009fb2f18f8Sesaxe 		 */
1010fb2f18f8Sesaxe 		if (GROUP_SIZE(&pg->cmt_cpus_actv) == 1 &&
10110e751525SEric Saxe 		    (pg->cmt_policy & (CMT_BALANCE | CMT_COALESCE))) {
1012fb2f18f8Sesaxe 			err = group_add(pg->cmt_siblings, pg, GRP_NORESIZE);
1013fb2f18f8Sesaxe 			ASSERT(err == 0);
10146890d023SEric Saxe 
10156890d023SEric Saxe 			/*
10166890d023SEric Saxe 			 * If this is a top level PG, add it as a balancing
10170e751525SEric Saxe 			 * candidate when balancing within the root lgroup.
10186890d023SEric Saxe 			 */
10190e751525SEric Saxe 			if (pg->cmt_parent == NULL &&
10200e751525SEric Saxe 			    pg->cmt_siblings != &cmt_root->cl_pgs) {
10216890d023SEric Saxe 				err = group_add(&cmt_root->cl_pgs, pg,
10226890d023SEric Saxe 				    GRP_NORESIZE);
10236890d023SEric Saxe 				ASSERT(err == 0);
10246890d023SEric Saxe 			}
1025fb2f18f8Sesaxe 		}
1026fb2f18f8Sesaxe 
1027fb2f18f8Sesaxe 		/*
1028fb2f18f8Sesaxe 		 * Notate the CPU in the PGs active CPU bitset.
1029fb2f18f8Sesaxe 		 * Also notate the PG as being active in it's associated
1030fb2f18f8Sesaxe 		 * partition
1031fb2f18f8Sesaxe 		 */
1032fb2f18f8Sesaxe 		bitset_add(&pg->cmt_cpus_actv_set, cp->cpu_seqid);
1033fb2f18f8Sesaxe 		bitset_add(&cp->cpu_part->cp_cmt_pgs, ((pg_t *)pg)->pg_id);
1034fb2f18f8Sesaxe 	}
1035fb2f18f8Sesaxe }
1036fb2f18f8Sesaxe 
1037fb2f18f8Sesaxe /*
1038fb2f18f8Sesaxe  * Class callback when a CPU goes inactive (offline)
1039fb2f18f8Sesaxe  *
1040fb2f18f8Sesaxe  * This is called in a context where CPUs are paused
1041fb2f18f8Sesaxe  */
1042fb2f18f8Sesaxe static void
1043fb2f18f8Sesaxe pg_cmt_cpu_inactive(cpu_t *cp)
1044fb2f18f8Sesaxe {
1045fb2f18f8Sesaxe 	int		err;
1046fb2f18f8Sesaxe 	group_t		*pgs;
1047fb2f18f8Sesaxe 	pg_cmt_t	*pg;
1048fb2f18f8Sesaxe 	cpu_t		*cpp;
1049fb2f18f8Sesaxe 	group_iter_t	i;
1050fb2f18f8Sesaxe 	pg_cpu_itr_t	cpu_itr;
1051fb2f18f8Sesaxe 	boolean_t	found;
1052fb2f18f8Sesaxe 
1053fb2f18f8Sesaxe 	ASSERT(MUTEX_HELD(&cpu_lock));
1054fb2f18f8Sesaxe 
10550e751525SEric Saxe 	if (cmt_sched_disabled)
10560e751525SEric Saxe 		return;
10570e751525SEric Saxe 
1058fb2f18f8Sesaxe 	pgs = &cp->cpu_pg->pgs;
1059fb2f18f8Sesaxe 	group_iter_init(&i);
1060fb2f18f8Sesaxe 
1061fb2f18f8Sesaxe 	while ((pg = group_iterate(pgs, &i)) != NULL) {
1062fb2f18f8Sesaxe 
1063fb2f18f8Sesaxe 		if (IS_CMT_PG(pg) == 0)
1064fb2f18f8Sesaxe 			continue;
1065fb2f18f8Sesaxe 
1066fb2f18f8Sesaxe 		/*
1067*b885580bSAlexander Kolbasov 		 * Move to the next generation since topology is changing
1068*b885580bSAlexander Kolbasov 		 */
1069*b885580bSAlexander Kolbasov 		((pghw_t *)pg)->pghw_generation++;
1070*b885580bSAlexander Kolbasov 
1071*b885580bSAlexander Kolbasov 		/*
1072fb2f18f8Sesaxe 		 * Remove the CPU from the CMT PGs active CPU group
1073fb2f18f8Sesaxe 		 * bitmap
1074fb2f18f8Sesaxe 		 */
1075fb2f18f8Sesaxe 		err = group_remove(&pg->cmt_cpus_actv, cp, GRP_NORESIZE);
1076fb2f18f8Sesaxe 		ASSERT(err == 0);
1077fb2f18f8Sesaxe 
1078fb2f18f8Sesaxe 		bitset_del(&pg->cmt_cpus_actv_set, cp->cpu_seqid);
1079fb2f18f8Sesaxe 
1080fb2f18f8Sesaxe 		/*
1081fb2f18f8Sesaxe 		 * If there are no more active CPUs in this PG over which
1082fb2f18f8Sesaxe 		 * load was balanced, remove it as a balancing candidate.
1083fb2f18f8Sesaxe 		 */
1084fb2f18f8Sesaxe 		if (GROUP_SIZE(&pg->cmt_cpus_actv) == 0 &&
10850e751525SEric Saxe 		    (pg->cmt_policy & (CMT_BALANCE | CMT_COALESCE))) {
1086fb2f18f8Sesaxe 			err = group_remove(pg->cmt_siblings, pg, GRP_NORESIZE);
1087fb2f18f8Sesaxe 			ASSERT(err == 0);
10886890d023SEric Saxe 
10890e751525SEric Saxe 			if (pg->cmt_parent == NULL &&
10900e751525SEric Saxe 			    pg->cmt_siblings != &cmt_root->cl_pgs) {
10916890d023SEric Saxe 				err = group_remove(&cmt_root->cl_pgs, pg,
10926890d023SEric Saxe 				    GRP_NORESIZE);
10936890d023SEric Saxe 				ASSERT(err == 0);
10946890d023SEric Saxe 			}
1095fb2f18f8Sesaxe 		}
1096fb2f18f8Sesaxe 
1097fb2f18f8Sesaxe 		/*
1098fb2f18f8Sesaxe 		 * Assert the number of active CPUs does not exceed
1099fb2f18f8Sesaxe 		 * the total number of CPUs in the PG
1100fb2f18f8Sesaxe 		 */
1101fb2f18f8Sesaxe 		ASSERT(GROUP_SIZE(&pg->cmt_cpus_actv) <=
1102fb2f18f8Sesaxe 		    GROUP_SIZE(&((pg_t *)pg)->pg_cpus));
1103fb2f18f8Sesaxe 
1104fb2f18f8Sesaxe 		/*
1105fb2f18f8Sesaxe 		 * Update the PG bitset in the CPU's old partition
1106fb2f18f8Sesaxe 		 */
1107fb2f18f8Sesaxe 		found = B_FALSE;
1108fb2f18f8Sesaxe 		PG_CPU_ITR_INIT(pg, cpu_itr);
1109fb2f18f8Sesaxe 		while ((cpp = pg_cpu_next(&cpu_itr)) != NULL) {
1110fb2f18f8Sesaxe 			if (cpp == cp)
1111fb2f18f8Sesaxe 				continue;
1112a6604450Sesaxe 			if (CPU_ACTIVE(cpp) &&
1113a6604450Sesaxe 			    cpp->cpu_part->cp_id == cp->cpu_part->cp_id) {
1114fb2f18f8Sesaxe 				found = B_TRUE;
1115fb2f18f8Sesaxe 				break;
1116fb2f18f8Sesaxe 			}
1117fb2f18f8Sesaxe 		}
1118fb2f18f8Sesaxe 		if (!found) {
1119fb2f18f8Sesaxe 			bitset_del(&cp->cpu_part->cp_cmt_pgs,
1120fb2f18f8Sesaxe 			    ((pg_t *)pg)->pg_id);
1121fb2f18f8Sesaxe 		}
1122fb2f18f8Sesaxe 	}
1123fb2f18f8Sesaxe }
1124fb2f18f8Sesaxe 
1125fb2f18f8Sesaxe /*
1126fb2f18f8Sesaxe  * Return non-zero if the CPU belongs in the given PG
1127fb2f18f8Sesaxe  */
1128fb2f18f8Sesaxe static int
1129fb2f18f8Sesaxe pg_cmt_cpu_belongs(pg_t *pg, cpu_t *cp)
1130fb2f18f8Sesaxe {
1131fb2f18f8Sesaxe 	cpu_t	*pg_cpu;
1132fb2f18f8Sesaxe 
1133fb2f18f8Sesaxe 	pg_cpu = GROUP_ACCESS(&pg->pg_cpus, 0);
1134fb2f18f8Sesaxe 
1135fb2f18f8Sesaxe 	ASSERT(pg_cpu != NULL);
1136fb2f18f8Sesaxe 
1137fb2f18f8Sesaxe 	/*
1138fb2f18f8Sesaxe 	 * The CPU belongs if, given the nature of the hardware sharing
1139fb2f18f8Sesaxe 	 * relationship represented by the PG, the CPU has that
1140fb2f18f8Sesaxe 	 * relationship with some other CPU already in the PG
1141fb2f18f8Sesaxe 	 */
1142fb2f18f8Sesaxe 	if (pg_plat_cpus_share(cp, pg_cpu, ((pghw_t *)pg)->pghw_hw))
1143fb2f18f8Sesaxe 		return (1);
1144fb2f18f8Sesaxe 
1145fb2f18f8Sesaxe 	return (0);
1146fb2f18f8Sesaxe }
1147fb2f18f8Sesaxe 
1148fb2f18f8Sesaxe /*
11490e751525SEric Saxe  * Sort the CPUs CMT hierarchy, where "size" is the number of levels.
1150fb2f18f8Sesaxe  */
1151fb2f18f8Sesaxe static void
11520e751525SEric Saxe pg_cmt_hier_sort(pg_cmt_t **hier, int size)
1153fb2f18f8Sesaxe {
11548031591dSSrihari Venkatesan 	int		i, j, inc, sz;
11558031591dSSrihari Venkatesan 	int		start, end;
11560e751525SEric Saxe 	pg_t		*tmp;
11570e751525SEric Saxe 	pg_t		**h = (pg_t **)hier;
1158fb2f18f8Sesaxe 
11590e751525SEric Saxe 	/*
11600e751525SEric Saxe 	 * First sort by number of CPUs
11610e751525SEric Saxe 	 */
11620e751525SEric Saxe 	inc = size / 2;
11630e751525SEric Saxe 	while (inc > 0) {
11640e751525SEric Saxe 		for (i = inc; i < size; i++) {
11650e751525SEric Saxe 			j = i;
11660e751525SEric Saxe 			tmp = h[i];
11670e751525SEric Saxe 			while ((j >= inc) &&
11680e751525SEric Saxe 			    (PG_NUM_CPUS(h[j - inc]) > PG_NUM_CPUS(tmp))) {
11690e751525SEric Saxe 				h[j] = h[j - inc];
11700e751525SEric Saxe 				j = j - inc;
11710e751525SEric Saxe 			}
11720e751525SEric Saxe 			h[j] = tmp;
11730e751525SEric Saxe 		}
11740e751525SEric Saxe 		if (inc == 2)
11750e751525SEric Saxe 			inc = 1;
11760e751525SEric Saxe 		else
11770e751525SEric Saxe 			inc = (inc * 5) / 11;
11780e751525SEric Saxe 	}
1179fb2f18f8Sesaxe 
11800e751525SEric Saxe 	/*
11810e751525SEric Saxe 	 * Break ties by asking the platform.
11820e751525SEric Saxe 	 * Determine if h[i] outranks h[i + 1] and if so, swap them.
11830e751525SEric Saxe 	 */
11848031591dSSrihari Venkatesan 	for (start = 0; start < size; start++) {
11858031591dSSrihari Venkatesan 
11868031591dSSrihari Venkatesan 		/*
11878031591dSSrihari Venkatesan 		 * Find various contiguous sets of elements,
11888031591dSSrihari Venkatesan 		 * in the array, with the same number of cpus
11898031591dSSrihari Venkatesan 		 */
11908031591dSSrihari Venkatesan 		end = start;
11918031591dSSrihari Venkatesan 		sz = PG_NUM_CPUS(h[start]);
11928031591dSSrihari Venkatesan 		while ((end < size) && (sz == PG_NUM_CPUS(h[end])))
11938031591dSSrihari Venkatesan 			end++;
11948031591dSSrihari Venkatesan 		/*
11958031591dSSrihari Venkatesan 		 * Sort each such set of the array by rank
11968031591dSSrihari Venkatesan 		 */
11978031591dSSrihari Venkatesan 		for (i = start + 1; i < end; i++) {
11988031591dSSrihari Venkatesan 			j = i - 1;
11990e751525SEric Saxe 			tmp = h[i];
12008031591dSSrihari Venkatesan 			while (j >= start &&
12018031591dSSrihari Venkatesan 			    pg_cmt_hier_rank(hier[j],
12028031591dSSrihari Venkatesan 			    (pg_cmt_t *)tmp) == hier[j]) {
12038031591dSSrihari Venkatesan 				h[j + 1] = h[j];
12048031591dSSrihari Venkatesan 				j--;
12058031591dSSrihari Venkatesan 			}
12068031591dSSrihari Venkatesan 			h[j + 1] = tmp;
1207fb2f18f8Sesaxe 		}
1208fb2f18f8Sesaxe 	}
1209fb2f18f8Sesaxe }
1210fb2f18f8Sesaxe 
1211fb2f18f8Sesaxe /*
1212fb2f18f8Sesaxe  * Return a cmt_lgrp_t * given an lgroup handle.
1213fb2f18f8Sesaxe  */
1214fb2f18f8Sesaxe static cmt_lgrp_t *
1215fb2f18f8Sesaxe pg_cmt_find_lgrp(lgrp_handle_t hand)
1216fb2f18f8Sesaxe {
1217fb2f18f8Sesaxe 	cmt_lgrp_t	*lgrp;
1218fb2f18f8Sesaxe 
1219fb2f18f8Sesaxe 	ASSERT(MUTEX_HELD(&cpu_lock));
1220fb2f18f8Sesaxe 
1221fb2f18f8Sesaxe 	lgrp = cmt_lgrps;
1222fb2f18f8Sesaxe 	while (lgrp != NULL) {
1223fb2f18f8Sesaxe 		if (lgrp->cl_hand == hand)
1224a6604450Sesaxe 			break;
1225fb2f18f8Sesaxe 		lgrp = lgrp->cl_next;
1226fb2f18f8Sesaxe 	}
1227a6604450Sesaxe 	return (lgrp);
1228a6604450Sesaxe }
1229fb2f18f8Sesaxe 
1230fb2f18f8Sesaxe /*
1231a6604450Sesaxe  * Create a cmt_lgrp_t with the specified handle.
1232fb2f18f8Sesaxe  */
1233a6604450Sesaxe static cmt_lgrp_t *
1234a6604450Sesaxe pg_cmt_lgrp_create(lgrp_handle_t hand)
1235a6604450Sesaxe {
1236a6604450Sesaxe 	cmt_lgrp_t	*lgrp;
1237a6604450Sesaxe 
1238a6604450Sesaxe 	ASSERT(MUTEX_HELD(&cpu_lock));
1239a6604450Sesaxe 
1240fb2f18f8Sesaxe 	lgrp = kmem_zalloc(sizeof (cmt_lgrp_t), KM_SLEEP);
1241fb2f18f8Sesaxe 
1242fb2f18f8Sesaxe 	lgrp->cl_hand = hand;
1243fb2f18f8Sesaxe 	lgrp->cl_npgs = 0;
1244fb2f18f8Sesaxe 	lgrp->cl_next = cmt_lgrps;
1245fb2f18f8Sesaxe 	cmt_lgrps = lgrp;
1246fb2f18f8Sesaxe 	group_create(&lgrp->cl_pgs);
1247fb2f18f8Sesaxe 
1248fb2f18f8Sesaxe 	return (lgrp);
1249fb2f18f8Sesaxe }
12506890d023SEric Saxe 
12516890d023SEric Saxe /*
12520e751525SEric Saxe  * Interfaces to enable and disable power aware dispatching
12530e751525SEric Saxe  * The caller must be holding cpu_lock.
12546890d023SEric Saxe  *
12550e751525SEric Saxe  * Return 0 on success and -1 on failure.
12566890d023SEric Saxe  */
12570e751525SEric Saxe int
12580e751525SEric Saxe cmt_pad_enable(pghw_type_t type)
12596890d023SEric Saxe {
12600e751525SEric Saxe 	group_t		*hwset;
12610e751525SEric Saxe 	group_iter_t	iter;
12620e751525SEric Saxe 	pg_cmt_t	*pg;
12636890d023SEric Saxe 
12640e751525SEric Saxe 	ASSERT(PGHW_IS_PM_DOMAIN(type));
12650e751525SEric Saxe 	ASSERT(MUTEX_HELD(&cpu_lock));
12666890d023SEric Saxe 
12670e751525SEric Saxe 	if ((hwset = pghw_set_lookup(type)) == NULL ||
12680e751525SEric Saxe 	    cmt_hw_blacklisted[type]) {
12690e751525SEric Saxe 		/*
12700e751525SEric Saxe 		 * Unable to find any instances of the specified type
12710e751525SEric Saxe 		 * of power domain, or the power domains have been blacklisted.
12720e751525SEric Saxe 		 */
12730e751525SEric Saxe 		return (-1);
12740e751525SEric Saxe 	}
12756890d023SEric Saxe 
12766890d023SEric Saxe 	/*
12770e751525SEric Saxe 	 * Iterate over the power domains, setting the default dispatcher
12780e751525SEric Saxe 	 * policy for power/performance optimization.
12790e751525SEric Saxe 	 *
12800e751525SEric Saxe 	 * Simply setting the policy isn't enough in the case where the power
12810e751525SEric Saxe 	 * domain is an only child of another PG. Because the dispatcher walks
12820e751525SEric Saxe 	 * the PG hierarchy in a top down fashion, the higher up PG's policy
12830e751525SEric Saxe 	 * will dominate. So promote the power domain above it's parent if both
12840e751525SEric Saxe 	 * PG and it's parent have the same CPUs to ensure it's policy
12850e751525SEric Saxe 	 * dominates.
12866890d023SEric Saxe 	 */
12870e751525SEric Saxe 	group_iter_init(&iter);
12880e751525SEric Saxe 	while ((pg = group_iterate(hwset, &iter)) != NULL) {
12890e751525SEric Saxe 		/*
12900e751525SEric Saxe 		 * If the power domain is an only child to a parent
12910e751525SEric Saxe 		 * not implementing the same policy, promote the child
12920e751525SEric Saxe 		 * above the parent to activate the policy.
12930e751525SEric Saxe 		 */
12940e751525SEric Saxe 		pg->cmt_policy = pg_cmt_policy(((pghw_t *)pg)->pghw_hw);
12950e751525SEric Saxe 		while ((pg->cmt_parent != NULL) &&
12960e751525SEric Saxe 		    (pg->cmt_parent->cmt_policy != pg->cmt_policy) &&
12970e751525SEric Saxe 		    (PG_NUM_CPUS((pg_t *)pg) ==
12980e751525SEric Saxe 		    PG_NUM_CPUS((pg_t *)pg->cmt_parent))) {
12991a77c24bSEric Saxe 			cmt_hier_promote(pg, NULL);
13000e751525SEric Saxe 		}
13010e751525SEric Saxe 	}
13020e751525SEric Saxe 
13030e751525SEric Saxe 	return (0);
13040e751525SEric Saxe }
13050e751525SEric Saxe 
13060e751525SEric Saxe int
13070e751525SEric Saxe cmt_pad_disable(pghw_type_t type)
13080e751525SEric Saxe {
13090e751525SEric Saxe 	group_t		*hwset;
13100e751525SEric Saxe 	group_iter_t	iter;
13110e751525SEric Saxe 	pg_cmt_t	*pg;
13120e751525SEric Saxe 	pg_cmt_t	*child;
13130e751525SEric Saxe 
13140e751525SEric Saxe 	ASSERT(PGHW_IS_PM_DOMAIN(type));
13150e751525SEric Saxe 	ASSERT(MUTEX_HELD(&cpu_lock));
13160e751525SEric Saxe 
13170e751525SEric Saxe 	if ((hwset = pghw_set_lookup(type)) == NULL) {
13180e751525SEric Saxe 		/*
13190e751525SEric Saxe 		 * Unable to find any instances of the specified type of
13200e751525SEric Saxe 		 * power domain.
13210e751525SEric Saxe 		 */
13220e751525SEric Saxe 		return (-1);
13230e751525SEric Saxe 	}
13240e751525SEric Saxe 	/*
13250e751525SEric Saxe 	 * Iterate over the power domains, setting the default dispatcher
13260e751525SEric Saxe 	 * policy for performance optimization (load balancing).
13270e751525SEric Saxe 	 */
13280e751525SEric Saxe 	group_iter_init(&iter);
13290e751525SEric Saxe 	while ((pg = group_iterate(hwset, &iter)) != NULL) {
13300e751525SEric Saxe 
13310e751525SEric Saxe 		/*
13320e751525SEric Saxe 		 * If the power domain has an only child that implements
13330e751525SEric Saxe 		 * policy other than load balancing, promote the child
13340e751525SEric Saxe 		 * above the power domain to ensure it's policy dominates.
13350e751525SEric Saxe 		 */
1336f03808b6SEric Saxe 		if (pg->cmt_children != NULL &&
1337f03808b6SEric Saxe 		    GROUP_SIZE(pg->cmt_children) == 1) {
13380e751525SEric Saxe 			child = GROUP_ACCESS(pg->cmt_children, 0);
13390e751525SEric Saxe 			if ((child->cmt_policy & CMT_BALANCE) == 0) {
13401a77c24bSEric Saxe 				cmt_hier_promote(child, NULL);
13410e751525SEric Saxe 			}
13420e751525SEric Saxe 		}
13430e751525SEric Saxe 		pg->cmt_policy = CMT_BALANCE;
13440e751525SEric Saxe 	}
13450e751525SEric Saxe 	return (0);
13460e751525SEric Saxe }
13470e751525SEric Saxe 
13480e751525SEric Saxe /* ARGSUSED */
13490e751525SEric Saxe static void
13500e751525SEric Saxe cmt_ev_thread_swtch(pg_t *pg, cpu_t *cp, hrtime_t now, kthread_t *old,
13510e751525SEric Saxe 		    kthread_t *new)
13520e751525SEric Saxe {
13530e751525SEric Saxe 	pg_cmt_t	*cmt_pg = (pg_cmt_t *)pg;
13540e751525SEric Saxe 
13550e751525SEric Saxe 	if (old == cp->cpu_idle_thread) {
13560e751525SEric Saxe 		atomic_add_32(&cmt_pg->cmt_utilization, 1);
13570e751525SEric Saxe 	} else if (new == cp->cpu_idle_thread) {
13580e751525SEric Saxe 		atomic_add_32(&cmt_pg->cmt_utilization, -1);
13590e751525SEric Saxe 	}
13600e751525SEric Saxe }
13610e751525SEric Saxe 
/*
 * Macro to test whether a thread is currently runnable on a CPU in a PG.
 *
 * True when thread "t" is in the TS_RUN state, its dispatch queue has an
 * associated CPU, and that CPU is in the active CPU bitset of CMT PG "pg".
 * Note that "t" is evaluated more than once.
 */
#define	THREAD_RUNNABLE_IN_PG(t, pg)					\
	(((t)->t_state == TS_RUN) &&					\
	    ((t)->t_disp_queue->disp_cpu != NULL) &&			\
	    bitset_in_set(&(pg)->cmt_cpus_actv_set,			\
	    (t)->t_disp_queue->disp_cpu->cpu_seqid))
13700e751525SEric Saxe 
13710e751525SEric Saxe static void
13720e751525SEric Saxe cmt_ev_thread_swtch_pwr(pg_t *pg, cpu_t *cp, hrtime_t now, kthread_t *old,
13730e751525SEric Saxe     kthread_t *new)
13740e751525SEric Saxe {
13750e751525SEric Saxe 	pg_cmt_t	*cmt = (pg_cmt_t *)pg;
13760e751525SEric Saxe 	cpupm_domain_t	*dom;
13770e751525SEric Saxe 	uint32_t	u;
13780e751525SEric Saxe 
13790e751525SEric Saxe 	if (old == cp->cpu_idle_thread) {
13800e751525SEric Saxe 		ASSERT(new != cp->cpu_idle_thread);
13810e751525SEric Saxe 		u = atomic_add_32_nv(&cmt->cmt_utilization, 1);
13820e751525SEric Saxe 		if (u == 1) {
13830e751525SEric Saxe 			/*
13840e751525SEric Saxe 			 * Notify the CPU power manager that the domain
13850e751525SEric Saxe 			 * is non-idle.
13860e751525SEric Saxe 			 */
13870e751525SEric Saxe 			dom = (cpupm_domain_t *)cmt->cmt_pg.pghw_handle;
13880e751525SEric Saxe 			cpupm_utilization_event(cp, now, dom,
13890e751525SEric Saxe 			    CPUPM_DOM_BUSY_FROM_IDLE);
13900e751525SEric Saxe 		}
13910e751525SEric Saxe 	} else if (new == cp->cpu_idle_thread) {
13920e751525SEric Saxe 		ASSERT(old != cp->cpu_idle_thread);
13930e751525SEric Saxe 		u = atomic_add_32_nv(&cmt->cmt_utilization, -1);
13940e751525SEric Saxe 		if (u == 0) {
13950e751525SEric Saxe 			/*
13960e751525SEric Saxe 			 * The domain is idle, notify the CPU power
13970e751525SEric Saxe 			 * manager.
13980e751525SEric Saxe 			 *
13990e751525SEric Saxe 			 * Avoid notifying if the thread is simply migrating
14000e751525SEric Saxe 			 * between CPUs in the domain.
14010e751525SEric Saxe 			 */
14020e751525SEric Saxe 			if (!THREAD_RUNNABLE_IN_PG(old, cmt)) {
14030e751525SEric Saxe 				dom = (cpupm_domain_t *)cmt->cmt_pg.pghw_handle;
14040e751525SEric Saxe 				cpupm_utilization_event(cp, now, dom,
14050e751525SEric Saxe 				    CPUPM_DOM_IDLE_FROM_BUSY);
14060e751525SEric Saxe 			}
14070e751525SEric Saxe 		}
14080e751525SEric Saxe 	}
14090e751525SEric Saxe }
14100e751525SEric Saxe 
14110e751525SEric Saxe /* ARGSUSED */
14120e751525SEric Saxe static void
14130e751525SEric Saxe cmt_ev_thread_remain_pwr(pg_t *pg, cpu_t *cp, kthread_t *t)
14140e751525SEric Saxe {
14150e751525SEric Saxe 	pg_cmt_t	*cmt = (pg_cmt_t *)pg;
14160e751525SEric Saxe 	cpupm_domain_t	*dom;
14170e751525SEric Saxe 
14180e751525SEric Saxe 	dom = (cpupm_domain_t *)cmt->cmt_pg.pghw_handle;
14190e751525SEric Saxe 	cpupm_utilization_event(cp, (hrtime_t)0, dom, CPUPM_DOM_REMAIN_BUSY);
14200e751525SEric Saxe }
14210e751525SEric Saxe 
14220e751525SEric Saxe /*
14230e751525SEric Saxe  * Return the name of the CMT scheduling policy
14240e751525SEric Saxe  * being implemented across this PG
14250e751525SEric Saxe  */
14260e751525SEric Saxe static char *
14270e751525SEric Saxe pg_cmt_policy_name(pg_t *pg)
14280e751525SEric Saxe {
14290e751525SEric Saxe 	pg_cmt_policy_t policy;
14300e751525SEric Saxe 
14310e751525SEric Saxe 	policy = ((pg_cmt_t *)pg)->cmt_policy;
14320e751525SEric Saxe 
14330e751525SEric Saxe 	if (policy & CMT_AFFINITY) {
14340e751525SEric Saxe 		if (policy & CMT_BALANCE)
14350e751525SEric Saxe 			return ("Load Balancing & Affinity");
14360e751525SEric Saxe 		else if (policy & CMT_COALESCE)
14370e751525SEric Saxe 			return ("Load Coalescence & Affinity");
14386890d023SEric Saxe 		else
14390e751525SEric Saxe 			return ("Affinity");
14400e751525SEric Saxe 	} else {
14410e751525SEric Saxe 		if (policy & CMT_BALANCE)
14420e751525SEric Saxe 			return ("Load Balancing");
14430e751525SEric Saxe 		else if (policy & CMT_COALESCE)
14440e751525SEric Saxe 			return ("Load Coalescence");
14450e751525SEric Saxe 		else
14460e751525SEric Saxe 			return ("None");
14470e751525SEric Saxe 	}
14480e751525SEric Saxe }
14496890d023SEric Saxe 
14506890d023SEric Saxe /*
14510e751525SEric Saxe  * Prune PG, and all other instances of PG's hardware sharing relationship
1452d0e93b69SEric Saxe  * from the CMT PG hierarchy.
14531a77c24bSEric Saxe  *
14541a77c24bSEric Saxe  * This routine operates on the CPU specific processor group data (for the CPUs
14551a77c24bSEric Saxe  * in the PG being pruned), and may be invoked from a context where one CPU's
14561a77c24bSEric Saxe  * PG data is under construction. In this case the argument "pgdata", if not
14571a77c24bSEric Saxe  * NULL, is a reference to the CPU's under-construction PG data.
14586890d023SEric Saxe  */
14590e751525SEric Saxe static int
14601a77c24bSEric Saxe pg_cmt_prune(pg_cmt_t *pg_bad, pg_cmt_t **lineage, int *sz, cpu_pg_t *pgdata)
14610e751525SEric Saxe {
14620e751525SEric Saxe 	group_t		*hwset, *children;
14630e751525SEric Saxe 	int		i, j, r, size = *sz;
14640e751525SEric Saxe 	group_iter_t	hw_iter, child_iter;
14650e751525SEric Saxe 	pg_cpu_itr_t	cpu_iter;
14660e751525SEric Saxe 	pg_cmt_t	*pg, *child;
14670e751525SEric Saxe 	cpu_t		*cpu;
14680e751525SEric Saxe 	int		cap_needed;
14690e751525SEric Saxe 	pghw_type_t	hw;
14706890d023SEric Saxe 
14710e751525SEric Saxe 	ASSERT(MUTEX_HELD(&cpu_lock));
14726890d023SEric Saxe 
14730e751525SEric Saxe 	hw = ((pghw_t *)pg_bad)->pghw_hw;
14740e751525SEric Saxe 
14750e751525SEric Saxe 	if (hw == PGHW_POW_ACTIVE) {
14760e751525SEric Saxe 		cmn_err(CE_NOTE, "!Active CPUPM domain groups look suspect. "
14770e751525SEric Saxe 		    "Event Based CPUPM Unavailable");
14780e751525SEric Saxe 	} else if (hw == PGHW_POW_IDLE) {
14790e751525SEric Saxe 		cmn_err(CE_NOTE, "!Idle CPUPM domain groups look suspect. "
14800e751525SEric Saxe 		    "Dispatcher assisted CPUPM disabled.");
14810e751525SEric Saxe 	}
14826890d023SEric Saxe 
14836890d023SEric Saxe 	/*
14840e751525SEric Saxe 	 * Find and eliminate the PG from the lineage.
14856890d023SEric Saxe 	 */
14860e751525SEric Saxe 	for (i = 0; i < size; i++) {
14870e751525SEric Saxe 		if (lineage[i] == pg_bad) {
14880e751525SEric Saxe 			for (j = i; j < size - 1; j++)
14890e751525SEric Saxe 				lineage[j] = lineage[j + 1];
14900e751525SEric Saxe 			*sz = size - 1;
14910e751525SEric Saxe 			break;
14920e751525SEric Saxe 		}
14930e751525SEric Saxe 	}
14940e751525SEric Saxe 
14950e751525SEric Saxe 	/*
14960e751525SEric Saxe 	 * We'll prune all instances of the hardware sharing relationship
14970e751525SEric Saxe 	 * represented by pg. But before we do that (and pause CPUs) we need
14980e751525SEric Saxe 	 * to ensure the hierarchy's groups are properly sized.
14990e751525SEric Saxe 	 */
15000e751525SEric Saxe 	hwset = pghw_set_lookup(hw);
15010e751525SEric Saxe 
15020e751525SEric Saxe 	/*
1503d0e93b69SEric Saxe 	 * Blacklist the hardware so future processor groups of this type won't
1504d0e93b69SEric Saxe 	 * participate in CMT thread placement.
1505d0e93b69SEric Saxe 	 *
1506d0e93b69SEric Saxe 	 * XXX
1507d0e93b69SEric Saxe 	 * For heterogeneous system configurations, this might be overkill.
1508d0e93b69SEric Saxe 	 * We may only need to blacklist the illegal PGs, and other instances
1509d0e93b69SEric Saxe 	 * of this hardware sharing relationship may be ok.
15100e751525SEric Saxe 	 */
15110e751525SEric Saxe 	cmt_hw_blacklisted[hw] = 1;
15120e751525SEric Saxe 
15130e751525SEric Saxe 	/*
15140e751525SEric Saxe 	 * For each of the PGs being pruned, ensure sufficient capacity in
15150e751525SEric Saxe 	 * the siblings set for the PG's children
15160e751525SEric Saxe 	 */
15170e751525SEric Saxe 	group_iter_init(&hw_iter);
15180e751525SEric Saxe 	while ((pg = group_iterate(hwset, &hw_iter)) != NULL) {
15190e751525SEric Saxe 		/*
15200e751525SEric Saxe 		 * PG is being pruned, but if it is bringing up more than
15210e751525SEric Saxe 		 * one child, ask for more capacity in the siblings group.
15220e751525SEric Saxe 		 */
15230e751525SEric Saxe 		cap_needed = 0;
15240e751525SEric Saxe 		if (pg->cmt_children &&
15250e751525SEric Saxe 		    GROUP_SIZE(pg->cmt_children) > 1) {
15260e751525SEric Saxe 			cap_needed = GROUP_SIZE(pg->cmt_children) - 1;
15270e751525SEric Saxe 
15280e751525SEric Saxe 			group_expand(pg->cmt_siblings,
15290e751525SEric Saxe 			    GROUP_SIZE(pg->cmt_siblings) + cap_needed);
15300e751525SEric Saxe 
15310e751525SEric Saxe 			/*
15320e751525SEric Saxe 			 * If this is a top level group, also ensure the
15330e751525SEric Saxe 			 * capacity in the root lgrp level CMT grouping.
15340e751525SEric Saxe 			 */
15350e751525SEric Saxe 			if (pg->cmt_parent == NULL &&
15360e751525SEric Saxe 			    pg->cmt_siblings != &cmt_root->cl_pgs) {
15370e751525SEric Saxe 				group_expand(&cmt_root->cl_pgs,
15380e751525SEric Saxe 				    GROUP_SIZE(&cmt_root->cl_pgs) + cap_needed);
1539d0e93b69SEric Saxe 				cmt_root->cl_npgs += cap_needed;
15400e751525SEric Saxe 			}
15410e751525SEric Saxe 		}
15420e751525SEric Saxe 	}
15430e751525SEric Saxe 
15440e751525SEric Saxe 	/*
15450e751525SEric Saxe 	 * We're operating on the PG hierarchy. Pause CPUs to ensure
15460e751525SEric Saxe 	 * exclusivity with respect to the dispatcher.
15470e751525SEric Saxe 	 */
15480e751525SEric Saxe 	pause_cpus(NULL);
15490e751525SEric Saxe 
15500e751525SEric Saxe 	/*
15510e751525SEric Saxe 	 * Prune all PG instances of the hardware sharing relationship
15520e751525SEric Saxe 	 * represented by pg.
15530e751525SEric Saxe 	 */
15540e751525SEric Saxe 	group_iter_init(&hw_iter);
15550e751525SEric Saxe 	while ((pg = group_iterate(hwset, &hw_iter)) != NULL) {
15560e751525SEric Saxe 
15570e751525SEric Saxe 		/*
15580e751525SEric Saxe 		 * Remove PG from it's group of siblings, if it's there.
15590e751525SEric Saxe 		 */
15600e751525SEric Saxe 		if (pg->cmt_siblings) {
15610e751525SEric Saxe 			(void) group_remove(pg->cmt_siblings, pg, GRP_NORESIZE);
15620e751525SEric Saxe 		}
15630e751525SEric Saxe 		if (pg->cmt_parent == NULL &&
15640e751525SEric Saxe 		    pg->cmt_siblings != &cmt_root->cl_pgs) {
15650e751525SEric Saxe 			(void) group_remove(&cmt_root->cl_pgs, pg,
15660e751525SEric Saxe 			    GRP_NORESIZE);
15670e751525SEric Saxe 		}
1568d0e93b69SEric Saxe 
1569d0e93b69SEric Saxe 		/*
1570d0e93b69SEric Saxe 		 * Indicate that no CMT policy will be implemented across
1571d0e93b69SEric Saxe 		 * this PG.
1572d0e93b69SEric Saxe 		 */
1573d0e93b69SEric Saxe 		pg->cmt_policy = CMT_NO_POLICY;
1574d0e93b69SEric Saxe 
15750e751525SEric Saxe 		/*
1576ef4f35d8SEric Saxe 		 * Move PG's children from it's children set to it's parent's
1577ef4f35d8SEric Saxe 		 * children set. Note that the parent's children set, and PG's
1578ef4f35d8SEric Saxe 		 * siblings set are the same thing.
1579ef4f35d8SEric Saxe 		 *
1580ef4f35d8SEric Saxe 		 * Because we are iterating over the same group that we are
1581ef4f35d8SEric Saxe 		 * operating on (removing the children), first add all of PG's
1582ef4f35d8SEric Saxe 		 * children to the parent's children set, and once we are done
1583ef4f35d8SEric Saxe 		 * iterating, empty PG's children set.
15840e751525SEric Saxe 		 */
15850e751525SEric Saxe 		if (pg->cmt_children != NULL) {
15860e751525SEric Saxe 			children = pg->cmt_children;
15870e751525SEric Saxe 
15880e751525SEric Saxe 			group_iter_init(&child_iter);
15890e751525SEric Saxe 			while ((child = group_iterate(children, &child_iter))
15900e751525SEric Saxe 			    != NULL) {
1591ef4f35d8SEric Saxe 				if (pg->cmt_siblings != NULL) {
15920e751525SEric Saxe 					r = group_add(pg->cmt_siblings, child,
15930e751525SEric Saxe 					    GRP_NORESIZE);
15940e751525SEric Saxe 					ASSERT(r == 0);
1595d0e93b69SEric Saxe 
1596d0e93b69SEric Saxe 					if (pg->cmt_parent == NULL &&
1597d0e93b69SEric Saxe 					    pg->cmt_siblings !=
1598d0e93b69SEric Saxe 					    &cmt_root->cl_pgs) {
1599d0e93b69SEric Saxe 						r = group_add(&cmt_root->cl_pgs,
1600d0e93b69SEric Saxe 						    child, GRP_NORESIZE);
1601d0e93b69SEric Saxe 						ASSERT(r == 0);
1602d0e93b69SEric Saxe 					}
16030e751525SEric Saxe 				}
16040e751525SEric Saxe 			}
1605ef4f35d8SEric Saxe 			group_empty(pg->cmt_children);
16060e751525SEric Saxe 		}
16070e751525SEric Saxe 
16080e751525SEric Saxe 		/*
16090e751525SEric Saxe 		 * Reset the callbacks to the defaults
16100e751525SEric Saxe 		 */
16110e751525SEric Saxe 		pg_callback_set_defaults((pg_t *)pg);
16120e751525SEric Saxe 
16130e751525SEric Saxe 		/*
16140e751525SEric Saxe 		 * Update all the CPU lineages in each of PG's CPUs
16150e751525SEric Saxe 		 */
16160e751525SEric Saxe 		PG_CPU_ITR_INIT(pg, cpu_iter);
16170e751525SEric Saxe 		while ((cpu = pg_cpu_next(&cpu_iter)) != NULL) {
16180e751525SEric Saxe 			pg_cmt_t	*cpu_pg;
16190e751525SEric Saxe 			group_iter_t	liter;	/* Iterator for the lineage */
16201a77c24bSEric Saxe 			cpu_pg_t	*cpd;	/* CPU's PG data */
16211a77c24bSEric Saxe 
16221a77c24bSEric Saxe 			/*
16231a77c24bSEric Saxe 			 * The CPU's lineage is under construction still
16241a77c24bSEric Saxe 			 * references the bootstrap CPU PG data structure.
16251a77c24bSEric Saxe 			 */
16261a77c24bSEric Saxe 			if (pg_cpu_is_bootstrapped(cpu))
16271a77c24bSEric Saxe 				cpd = pgdata;
16281a77c24bSEric Saxe 			else
16291a77c24bSEric Saxe 				cpd = cpu->cpu_pg;
16300e751525SEric Saxe 
16310e751525SEric Saxe 			/*
16320e751525SEric Saxe 			 * Iterate over the CPU's PGs updating the children
16330e751525SEric Saxe 			 * of the PG being promoted, since they have a new
16340e751525SEric Saxe 			 * parent and siblings set.
16350e751525SEric Saxe 			 */
16360e751525SEric Saxe 			group_iter_init(&liter);
16371a77c24bSEric Saxe 			while ((cpu_pg = group_iterate(&cpd->pgs,
16381a77c24bSEric Saxe 			    &liter)) != NULL) {
16390e751525SEric Saxe 				if (cpu_pg->cmt_parent == pg) {
16400e751525SEric Saxe 					cpu_pg->cmt_parent = pg->cmt_parent;
16410e751525SEric Saxe 					cpu_pg->cmt_siblings = pg->cmt_siblings;
16420e751525SEric Saxe 				}
16430e751525SEric Saxe 			}
16440e751525SEric Saxe 
16450e751525SEric Saxe 			/*
16460e751525SEric Saxe 			 * Update the CPU's lineages
1647d0e93b69SEric Saxe 			 *
1648d0e93b69SEric Saxe 			 * Remove the PG from the CPU's group used for CMT
1649d0e93b69SEric Saxe 			 * scheduling.
16500e751525SEric Saxe 			 */
16511a77c24bSEric Saxe 			(void) group_remove(&cpd->cmt_pgs, pg, GRP_NORESIZE);
16520e751525SEric Saxe 		}
16530e751525SEric Saxe 	}
16540e751525SEric Saxe 	start_cpus();
16550e751525SEric Saxe 	return (0);
16560e751525SEric Saxe }
16570e751525SEric Saxe 
16580e751525SEric Saxe /*
16590e751525SEric Saxe  * Disable CMT scheduling
16600e751525SEric Saxe  */
16610e751525SEric Saxe static void
16620e751525SEric Saxe pg_cmt_disable(void)
16630e751525SEric Saxe {
16640e751525SEric Saxe 	cpu_t		*cpu;
16650e751525SEric Saxe 
16661a77c24bSEric Saxe 	ASSERT(MUTEX_HELD(&cpu_lock));
16671a77c24bSEric Saxe 
16680e751525SEric Saxe 	pause_cpus(NULL);
16690e751525SEric Saxe 	cpu = cpu_list;
16700e751525SEric Saxe 
16716890d023SEric Saxe 	do {
16720e751525SEric Saxe 		if (cpu->cpu_pg)
16730e751525SEric Saxe 			group_empty(&cpu->cpu_pg->cmt_pgs);
16740e751525SEric Saxe 	} while ((cpu = cpu->cpu_next) != cpu_list);
16750e751525SEric Saxe 
16760e751525SEric Saxe 	cmt_sched_disabled = 1;
16770e751525SEric Saxe 	start_cpus();
16780e751525SEric Saxe 	cmn_err(CE_NOTE, "!CMT thread placement optimizations unavailable");
16790e751525SEric Saxe }
16800e751525SEric Saxe 
1681ef4f35d8SEric Saxe /*
1682ef4f35d8SEric Saxe  * CMT lineage validation
1683ef4f35d8SEric Saxe  *
1684ef4f35d8SEric Saxe  * This routine is invoked by pg_cmt_cpu_init() to validate the integrity
1685ef4f35d8SEric Saxe  * of the PGs in a CPU's lineage. This is necessary because it's possible that
1686ef4f35d8SEric Saxe  * some groupings (power domain groupings in particular) may be defined by
1687ef4f35d8SEric Saxe  * sources that are buggy (e.g. BIOS bugs). In such cases, it may not be
1688ef4f35d8SEric Saxe  * possible to integrate those groupings into the CMT PG hierarchy, if doing
1689ef4f35d8SEric Saxe  * so would violate the subset invariant of the hierarchy, which says that
1690ef4f35d8SEric Saxe  * a PG must be subset of its parent (if it has one).
1691ef4f35d8SEric Saxe  *
1692ef4f35d8SEric Saxe  * pg_cmt_lineage_validate()'s purpose is to detect grouping definitions that
1693ef4f35d8SEric Saxe  * would result in a violation of this invariant. If a violation is found,
1694ef4f35d8SEric Saxe  * and the PG is of a grouping type who's definition is known to originate from
1695ef4f35d8SEric Saxe  * suspect sources (BIOS), then pg_cmt_prune() will be invoked to prune the
1696b025faeeSEric Saxe  * PG (and all other instances PG's sharing relationship type) from the CMT
1697ef4f35d8SEric Saxe  * hierarchy. Further, future instances of that sharing relationship type won't
1698b025faeeSEric Saxe  * be added. If the grouping definition doesn't originate from suspect
1699ef4f35d8SEric Saxe  * sources, then pg_cmt_disable() will be invoked to log an error, and disable
1700ef4f35d8SEric Saxe  * CMT scheduling altogether.
1701ef4f35d8SEric Saxe  *
1702ef4f35d8SEric Saxe  * This routine is invoked after the CPU has been added to the PGs in which
1703ef4f35d8SEric Saxe  * it belongs, but before those PGs have been added to (or had their place
1704ef4f35d8SEric Saxe  * adjusted in) the CMT PG hierarchy.
1705ef4f35d8SEric Saxe  *
1706ef4f35d8SEric Saxe  * The first argument is the CPUs PG lineage (essentially an array of PGs in
1707ef4f35d8SEric Saxe  * which the CPU belongs) that has already been sorted in ascending order
1708ef4f35d8SEric Saxe  * by CPU count. Some of the PGs in the CPUs lineage may already have other
1709ef4f35d8SEric Saxe  * CPUs in them, and have already been integrated into the CMT hierarchy.
1710ef4f35d8SEric Saxe  *
1711ef4f35d8SEric Saxe  * The addition of this new CPU to these pre-existing PGs means that those
1712ef4f35d8SEric Saxe  * PGs may need to be promoted up in the hierarchy to satisfy the subset
1713ef4f35d8SEric Saxe  * invariant. In additon to testing the subset invariant for the lineage,
1714ef4f35d8SEric Saxe  * this routine also verifies that the addition of the new CPU to the
1715ef4f35d8SEric Saxe  * existing PGs wouldn't cause the subset invariant to be violated in
1716ef4f35d8SEric Saxe  * the exiting lineages.
1717ef4f35d8SEric Saxe  *
1718ef4f35d8SEric Saxe  * This routine will normally return one of the following:
1719ef4f35d8SEric Saxe  * CMT_LINEAGE_VALID - There were no problems detected with the lineage.
1720ef4f35d8SEric Saxe  * CMT_LINEAGE_REPAIRED - Problems were detected, but repaired via pruning.
1721ef4f35d8SEric Saxe  *
1722ef4f35d8SEric Saxe  * Otherwise, this routine will return a value indicating which error it
1723ef4f35d8SEric Saxe  * was unable to recover from (and set cmt_lineage_status along the way).
17241a77c24bSEric Saxe  *
17251a77c24bSEric Saxe  * This routine operates on the CPU specific processor group data (for the CPU
17261a77c24bSEric Saxe  * whose lineage is being validated), which is under-construction.
17271a77c24bSEric Saxe  * "pgdata" is a reference to the CPU's under-construction PG data.
17281a77c24bSEric Saxe  * This routine must be careful to operate only on "pgdata", and not cp->cpu_pg.
1729ef4f35d8SEric Saxe  */
1730ef4f35d8SEric Saxe static cmt_lineage_validation_t
17311a77c24bSEric Saxe pg_cmt_lineage_validate(pg_cmt_t **lineage, int *sz, cpu_pg_t *pgdata)
17320e751525SEric Saxe {
1733ef4f35d8SEric Saxe 	int		i, j, size;
1734b025faeeSEric Saxe 	pg_cmt_t	*pg, *pg_next, *pg_bad, *pg_tmp, *parent;
17350e751525SEric Saxe 	cpu_t		*cp;
17360e751525SEric Saxe 	pg_cpu_itr_t	cpu_iter;
1737ef4f35d8SEric Saxe 	lgrp_handle_t	lgrp;
17380e751525SEric Saxe 
17390e751525SEric Saxe 	ASSERT(MUTEX_HELD(&cpu_lock));
17400e751525SEric Saxe 
17410e751525SEric Saxe revalidate:
17420e751525SEric Saxe 	size = *sz;
17430e751525SEric Saxe 	pg_bad = NULL;
1744ef4f35d8SEric Saxe 	lgrp = LGRP_NULL_HANDLE;
1745ef4f35d8SEric Saxe 	for (i = 0; i < size; i++) {
17460e751525SEric Saxe 
17470e751525SEric Saxe 		pg = lineage[i];
1748ef4f35d8SEric Saxe 		if (i < size - 1)
1749ef4f35d8SEric Saxe 			pg_next = lineage[i + 1];
1750ef4f35d8SEric Saxe 		else
1751ef4f35d8SEric Saxe 			pg_next = NULL;
17526890d023SEric Saxe 
17536890d023SEric Saxe 		/*
17540e751525SEric Saxe 		 * We assume that the lineage has already been sorted
17550e751525SEric Saxe 		 * by the number of CPUs. In fact, we depend on it.
17566890d023SEric Saxe 		 */
1757ef4f35d8SEric Saxe 		ASSERT(pg_next == NULL ||
1758ef4f35d8SEric Saxe 		    (PG_NUM_CPUS((pg_t *)pg) <= PG_NUM_CPUS((pg_t *)pg_next)));
17596890d023SEric Saxe 
17606890d023SEric Saxe 		/*
1761b025faeeSEric Saxe 		 * The CPUs PG lineage was passed as the first argument to
1762b025faeeSEric Saxe 		 * this routine and contains the sorted list of the CPU's
1763b025faeeSEric Saxe 		 * PGs. Ultimately, the ordering of the PGs in that list, and
1764b025faeeSEric Saxe 		 * the ordering as traversed by the cmt_parent list must be
1765b025faeeSEric Saxe 		 * the same. PG promotion will be used as the mechanism to
1766b025faeeSEric Saxe 		 * achieve this, but first we need to look for cases where
1767b025faeeSEric Saxe 		 * promotion will be necessary, and validate that will be
1768b025faeeSEric Saxe 		 * possible without violating the subset invarient described
1769b025faeeSEric Saxe 		 * above.
1770ef4f35d8SEric Saxe 		 *
1771ef4f35d8SEric Saxe 		 * Since the PG topology is in the middle of being changed, we
1772ef4f35d8SEric Saxe 		 * need to check whether the PG's existing parent (if any) is
1773b025faeeSEric Saxe 		 * part of this CPU's lineage (and therefore should contain
1774b025faeeSEric Saxe 		 * the new CPU). If not, it means that the addition of the
1775b025faeeSEric Saxe 		 * new CPU should have made this PG have more CPUs than its
1776b025faeeSEric Saxe 		 * parent (and other ancestors not in the same lineage) and
1777b025faeeSEric Saxe 		 * will need to be promoted into place.
1778b025faeeSEric Saxe 		 *
1779b025faeeSEric Saxe 		 * We need to verify all of this to defend against a buggy
1780ef4f35d8SEric Saxe 		 * BIOS giving bad power domain CPU groupings. Sigh.
1781ef4f35d8SEric Saxe 		 */
1782b025faeeSEric Saxe 		parent = pg->cmt_parent;
1783b025faeeSEric Saxe 		while (parent != NULL) {
1784ef4f35d8SEric Saxe 			/*
1785b025faeeSEric Saxe 			 * Determine if the parent/ancestor is in this lineage
1786ef4f35d8SEric Saxe 			 */
1787b025faeeSEric Saxe 			pg_tmp = NULL;
1788b025faeeSEric Saxe 			for (j = 0; (j < size) && (pg_tmp != parent); j++) {
1789ef4f35d8SEric Saxe 				pg_tmp = lineage[j];
1790b025faeeSEric Saxe 			}
1791b025faeeSEric Saxe 			if (pg_tmp == parent) {
1792b025faeeSEric Saxe 				/*
1793b025faeeSEric Saxe 				 * It's in the lineage. The concentricity
1794b025faeeSEric Saxe 				 * checks will handle the rest.
1795b025faeeSEric Saxe 				 */
1796ef4f35d8SEric Saxe 				break;
1797ef4f35d8SEric Saxe 			}
1798ef4f35d8SEric Saxe 			/*
1799b025faeeSEric Saxe 			 * If it is not in the lineage, PG will eventually
1800b025faeeSEric Saxe 			 * need to be promoted above it. Verify the ancestor
1801b025faeeSEric Saxe 			 * is a proper subset. There is still an error if
1802b025faeeSEric Saxe 			 * the ancestor has the same number of CPUs as PG,
1803b025faeeSEric Saxe 			 * since that would imply it should be in the lineage,
1804b025faeeSEric Saxe 			 * and we already know it isn't.
1805ef4f35d8SEric Saxe 			 */
1806b025faeeSEric Saxe 			if (PG_NUM_CPUS((pg_t *)parent) >=
1807ef4f35d8SEric Saxe 			    PG_NUM_CPUS((pg_t *)pg)) {
1808ef4f35d8SEric Saxe 				/*
1809b025faeeSEric Saxe 				 * Not a proper subset if the parent/ancestor
1810b025faeeSEric Saxe 				 * has the same or more CPUs than PG.
1811ef4f35d8SEric Saxe 				 */
1812b025faeeSEric Saxe 				cmt_lineage_status = CMT_LINEAGE_NON_PROMOTABLE;
1813ef4f35d8SEric Saxe 				goto handle_error;
1814ef4f35d8SEric Saxe 			}
1815b025faeeSEric Saxe 			parent = parent->cmt_parent;
1816ef4f35d8SEric Saxe 		}
1817ef4f35d8SEric Saxe 
1818ef4f35d8SEric Saxe 		/*
1819ef4f35d8SEric Saxe 		 * Walk each of the CPUs in the PGs group and perform
1820ef4f35d8SEric Saxe 		 * consistency checks along the way.
18216890d023SEric Saxe 		 */
18220e751525SEric Saxe 		PG_CPU_ITR_INIT((pg_t *)pg, cpu_iter);
18230e751525SEric Saxe 		while ((cp = pg_cpu_next(&cpu_iter)) != NULL) {
1824ef4f35d8SEric Saxe 			/*
1825ef4f35d8SEric Saxe 			 * Verify that there aren't any CPUs contained in PG
1826ef4f35d8SEric Saxe 			 * that the next PG in the lineage (which is larger
1827ef4f35d8SEric Saxe 			 * or same size) doesn't also contain.
1828ef4f35d8SEric Saxe 			 */
1829ef4f35d8SEric Saxe 			if (pg_next != NULL &&
1830ef4f35d8SEric Saxe 			    pg_cpu_find((pg_t *)pg_next, cp) == B_FALSE) {
18310e751525SEric Saxe 				cmt_lineage_status = CMT_LINEAGE_NON_CONCENTRIC;
18320e751525SEric Saxe 				goto handle_error;
18336890d023SEric Saxe 			}
1834ef4f35d8SEric Saxe 
1835ef4f35d8SEric Saxe 			/*
1836ef4f35d8SEric Saxe 			 * Verify that all the CPUs in the PG are in the same
1837ef4f35d8SEric Saxe 			 * lgroup.
1838ef4f35d8SEric Saxe 			 */
1839ef4f35d8SEric Saxe 			if (lgrp == LGRP_NULL_HANDLE) {
1840ef4f35d8SEric Saxe 				lgrp = lgrp_plat_cpu_to_hand(cp->cpu_id);
1841ef4f35d8SEric Saxe 			} else if (lgrp_plat_cpu_to_hand(cp->cpu_id) != lgrp) {
1842ef4f35d8SEric Saxe 				cmt_lineage_status = CMT_LINEAGE_PG_SPANS_LGRPS;
1843ef4f35d8SEric Saxe 				goto handle_error;
1844ef4f35d8SEric Saxe 			}
18450e751525SEric Saxe 		}
18466890d023SEric Saxe 	}
18476890d023SEric Saxe 
18480e751525SEric Saxe handle_error:
1849ef4f35d8SEric Saxe 	/*
1850ef4f35d8SEric Saxe 	 * Some of these validation errors can result when the CPU grouping
1851ef4f35d8SEric Saxe 	 * information is derived from buggy sources (for example, incorrect
1852ef4f35d8SEric Saxe 	 * ACPI tables on x86 systems).
1853ef4f35d8SEric Saxe 	 *
1854ef4f35d8SEric Saxe 	 * We'll try to recover in such cases by pruning out the illegal
1855ef4f35d8SEric Saxe 	 * groupings from the PG hierarchy, which means that we won't optimize
1856ef4f35d8SEric Saxe 	 * for those levels, but we will for the remaining ones.
1857ef4f35d8SEric Saxe 	 */
18580e751525SEric Saxe 	switch (cmt_lineage_status) {
18590e751525SEric Saxe 	case CMT_LINEAGE_VALID:
18600e751525SEric Saxe 	case CMT_LINEAGE_REPAIRED:
18610e751525SEric Saxe 		break;
1862ef4f35d8SEric Saxe 	case CMT_LINEAGE_PG_SPANS_LGRPS:
1863ef4f35d8SEric Saxe 		/*
1864ef4f35d8SEric Saxe 		 * We've detected a PG whose CPUs span lgroups.
1865ef4f35d8SEric Saxe 		 *
1866ef4f35d8SEric Saxe 		 * This isn't supported, as the dispatcher isn't allowed to
1867ef4f35d8SEric Saxe 		 * to do CMT thread placement across lgroups, as this would
1868ef4f35d8SEric Saxe 		 * conflict with policies implementing MPO thread affinity.
1869ef4f35d8SEric Saxe 		 *
1870d0e93b69SEric Saxe 		 * If the PG is of a sharing relationship type known to
1871d0e93b69SEric Saxe 		 * legitimately span lgroups, specify that no CMT thread
1872d0e93b69SEric Saxe 		 * placement policy should be implemented, and prune the PG
1873d0e93b69SEric Saxe 		 * from the existing CMT PG hierarchy.
1874d0e93b69SEric Saxe 		 *
1875d0e93b69SEric Saxe 		 * Otherwise, fall though to the case below for handling.
1876ef4f35d8SEric Saxe 		 */
1877d0e93b69SEric Saxe 		if (((pghw_t *)pg)->pghw_hw == PGHW_CHIP) {
1878d0e93b69SEric Saxe 			if (pg_cmt_prune(pg, lineage, sz, pgdata) == 0) {
1879d0e93b69SEric Saxe 				cmt_lineage_status = CMT_LINEAGE_REPAIRED;
1880d0e93b69SEric Saxe 				goto revalidate;
1881d0e93b69SEric Saxe 			}
1882d0e93b69SEric Saxe 		}
1883d0e93b69SEric Saxe 		/*LINTED*/
1884ef4f35d8SEric Saxe 	case CMT_LINEAGE_NON_PROMOTABLE:
1885ef4f35d8SEric Saxe 		/*
1886ef4f35d8SEric Saxe 		 * We've detected a PG that already exists in another CPU's
1887ef4f35d8SEric Saxe 		 * lineage that cannot cannot legally be promoted into place
1888ef4f35d8SEric Saxe 		 * without breaking the invariants of the hierarchy.
1889ef4f35d8SEric Saxe 		 */
1890ef4f35d8SEric Saxe 		if (PG_CMT_HW_SUSPECT(((pghw_t *)pg)->pghw_hw)) {
18911a77c24bSEric Saxe 			if (pg_cmt_prune(pg, lineage, sz, pgdata) == 0) {
1892ef4f35d8SEric Saxe 				cmt_lineage_status = CMT_LINEAGE_REPAIRED;
1893ef4f35d8SEric Saxe 				goto revalidate;
1894ef4f35d8SEric Saxe 			}
1895ef4f35d8SEric Saxe 		}
1896ef4f35d8SEric Saxe 		/*
1897ef4f35d8SEric Saxe 		 * Something went wrong trying to prune out the bad level.
1898ef4f35d8SEric Saxe 		 * Disable CMT scheduling altogether.
1899ef4f35d8SEric Saxe 		 */
1900ef4f35d8SEric Saxe 		pg_cmt_disable();
1901ef4f35d8SEric Saxe 		break;
19020e751525SEric Saxe 	case CMT_LINEAGE_NON_CONCENTRIC:
19036890d023SEric Saxe 		/*
1904ef4f35d8SEric Saxe 		 * We've detected a non-concentric PG lineage, which means that
1905ef4f35d8SEric Saxe 		 * there's a PG in the lineage that has CPUs that the next PG
1906ef4f35d8SEric Saxe 		 * over in the lineage (which is the same size or larger)
1907ef4f35d8SEric Saxe 		 * doesn't have.
19080e751525SEric Saxe 		 *
1909ef4f35d8SEric Saxe 		 * In this case, we examine the two PGs to see if either
1910ef4f35d8SEric Saxe 		 * grouping is defined by potentially buggy sources.
19110e751525SEric Saxe 		 *
19120e751525SEric Saxe 		 * If one has less CPUs than the other, and contains CPUs
19130e751525SEric Saxe 		 * not found in the parent, and it is an untrusted enumeration,
19140e751525SEric Saxe 		 * then prune it. If both have the same number of CPUs, then
19150e751525SEric Saxe 		 * prune the one that is untrusted.
19160e751525SEric Saxe 		 *
19170e751525SEric Saxe 		 * This process repeats until we have a concentric lineage,
19180e751525SEric Saxe 		 * or we would have to prune out level derived from what we
19190e751525SEric Saxe 		 * thought was a reliable source, in which case CMT scheduling
1920ef4f35d8SEric Saxe 		 * is disabled altogether.
19216890d023SEric Saxe 		 */
1922ef4f35d8SEric Saxe 		if ((PG_NUM_CPUS((pg_t *)pg) < PG_NUM_CPUS((pg_t *)pg_next)) &&
19230e751525SEric Saxe 		    (PG_CMT_HW_SUSPECT(((pghw_t *)pg)->pghw_hw))) {
19240e751525SEric Saxe 			pg_bad = pg;
19250e751525SEric Saxe 		} else if (PG_NUM_CPUS((pg_t *)pg) ==
1926ef4f35d8SEric Saxe 		    PG_NUM_CPUS((pg_t *)pg_next)) {
1927ef4f35d8SEric Saxe 			if (PG_CMT_HW_SUSPECT(((pghw_t *)pg_next)->pghw_hw)) {
1928ef4f35d8SEric Saxe 				pg_bad = pg_next;
19290e751525SEric Saxe 			} else if (PG_CMT_HW_SUSPECT(((pghw_t *)pg)->pghw_hw)) {
19300e751525SEric Saxe 				pg_bad = pg;
19316890d023SEric Saxe 			}
19326890d023SEric Saxe 		}
19330e751525SEric Saxe 		if (pg_bad) {
19341a77c24bSEric Saxe 			if (pg_cmt_prune(pg_bad, lineage, sz, pgdata) == 0) {
19350e751525SEric Saxe 				cmt_lineage_status = CMT_LINEAGE_REPAIRED;
19360e751525SEric Saxe 				goto revalidate;
19370e751525SEric Saxe 			}
19380e751525SEric Saxe 		}
19390e751525SEric Saxe 		/*
1940ef4f35d8SEric Saxe 		 * Something went wrong trying to identify and/or prune out
1941ef4f35d8SEric Saxe 		 * the bad level. Disable CMT scheduling altogether.
19420e751525SEric Saxe 		 */
19430e751525SEric Saxe 		pg_cmt_disable();
1944ef4f35d8SEric Saxe 		break;
1945ef4f35d8SEric Saxe 	default:
1946ef4f35d8SEric Saxe 		/*
1947ef4f35d8SEric Saxe 		 * If we're here, we've encountered a validation error for
1948ef4f35d8SEric Saxe 		 * which we don't know how to recover. In this case, disable
1949ef4f35d8SEric Saxe 		 * CMT scheduling altogether.
1950ef4f35d8SEric Saxe 		 */
19510e751525SEric Saxe 		cmt_lineage_status = CMT_LINEAGE_UNRECOVERABLE;
1952ef4f35d8SEric Saxe 		pg_cmt_disable();
19530e751525SEric Saxe 	}
1954ef4f35d8SEric Saxe 	return (cmt_lineage_status);
19556890d023SEric Saxe }
1956