1fb2f18f8Sesaxe /*
2fb2f18f8Sesaxe * CDDL HEADER START
3fb2f18f8Sesaxe *
4fb2f18f8Sesaxe * The contents of this file are subject to the terms of the
5fb2f18f8Sesaxe * Common Development and Distribution License (the "License").
6fb2f18f8Sesaxe * You may not use this file except in compliance with the License.
7fb2f18f8Sesaxe *
8fb2f18f8Sesaxe * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9fb2f18f8Sesaxe * or http://www.opensolaris.org/os/licensing.
10fb2f18f8Sesaxe * See the License for the specific language governing permissions
11fb2f18f8Sesaxe * and limitations under the License.
12fb2f18f8Sesaxe *
13fb2f18f8Sesaxe * When distributing Covered Code, include this CDDL HEADER in each
14fb2f18f8Sesaxe * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15fb2f18f8Sesaxe * If applicable, add the following below this CDDL HEADER, with the
16fb2f18f8Sesaxe * fields enclosed by brackets "[]" replaced with your own identifying
17fb2f18f8Sesaxe * information: Portions Copyright [yyyy] [name of copyright owner]
18fb2f18f8Sesaxe *
19fb2f18f8Sesaxe * CDDL HEADER END
20fb2f18f8Sesaxe */
21fb2f18f8Sesaxe /*
22d3c97224SAlexander Kolbasov * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
23fb2f18f8Sesaxe */
24fb2f18f8Sesaxe
25fb2f18f8Sesaxe #include <sys/systm.h>
26fb2f18f8Sesaxe #include <sys/types.h>
27fb2f18f8Sesaxe #include <sys/param.h>
28fb2f18f8Sesaxe #include <sys/thread.h>
29fb2f18f8Sesaxe #include <sys/cpuvar.h>
30fb2f18f8Sesaxe #include <sys/cpupart.h>
31fb2f18f8Sesaxe #include <sys/kmem.h>
32fb2f18f8Sesaxe #include <sys/cmn_err.h>
33fb2f18f8Sesaxe #include <sys/kstat.h>
34fb2f18f8Sesaxe #include <sys/processor.h>
35fb2f18f8Sesaxe #include <sys/disp.h>
36fb2f18f8Sesaxe #include <sys/group.h>
37fb2f18f8Sesaxe #include <sys/pghw.h>
38fb2f18f8Sesaxe #include <sys/bitset.h>
39fb2f18f8Sesaxe #include <sys/lgrp.h>
40fb2f18f8Sesaxe #include <sys/cmt.h>
410e751525SEric Saxe #include <sys/cpu_pm.h>
42fb2f18f8Sesaxe
43fb2f18f8Sesaxe /*
44fb2f18f8Sesaxe * CMT scheduler / dispatcher support
45fb2f18f8Sesaxe *
46fb2f18f8Sesaxe * This file implements CMT scheduler support using Processor Groups.
47fb2f18f8Sesaxe * The CMT processor group class creates and maintains the CMT class
48fb2f18f8Sesaxe * specific processor group pg_cmt_t.
49fb2f18f8Sesaxe *
50fb2f18f8Sesaxe * ---------------------------- <-- pg_cmt_t *
51fb2f18f8Sesaxe * | pghw_t |
52fb2f18f8Sesaxe * ----------------------------
53fb2f18f8Sesaxe * | CMT class specific data |
54fb2f18f8Sesaxe * | - hierarchy linkage |
55fb2f18f8Sesaxe * | - CMT load balancing data|
56fb2f18f8Sesaxe * | - active CPU group/bitset|
57fb2f18f8Sesaxe * ----------------------------
58fb2f18f8Sesaxe *
59fb2f18f8Sesaxe * The scheduler/dispatcher leverages knowledge of the performance
60fb2f18f8Sesaxe * relevant CMT sharing relationships existing between cpus to implement
610e751525SEric Saxe * optimized affinity, load balancing, and coalescence policies.
62fb2f18f8Sesaxe *
 * Load balancing policy seeks to improve performance by minimizing
 * contention over shared processor resources / facilities. Affinity
 * policies seek to improve cache and TLB utilization. Coalescence
 * policies improve resource utilization and ultimately power efficiency.
67fb2f18f8Sesaxe *
68fb2f18f8Sesaxe * The CMT PGs created by this class are already arranged into a
69fb2f18f8Sesaxe * hierarchy (which is done in the pghw layer). To implement the top-down
70fb2f18f8Sesaxe * CMT load balancing algorithm, the CMT PGs additionally maintain
71fb2f18f8Sesaxe * parent, child and sibling hierarchy relationships.
72fb2f18f8Sesaxe * Parent PGs always contain a superset of their children(s) resources,
73fb2f18f8Sesaxe * each PG can have at most one parent, and siblings are the group of PGs
74fb2f18f8Sesaxe * sharing the same parent.
75fb2f18f8Sesaxe *
76d0e93b69SEric Saxe * On UMA based systems, the CMT load balancing algorithm begins by balancing
77d0e93b69SEric Saxe * load across the group of top level PGs in the system hierarchy.
78d0e93b69SEric Saxe * On NUMA systems, the CMT load balancing algorithm balances load across the
79d0e93b69SEric Saxe * group of top level PGs in each leaf lgroup...but for root homed threads,
80d0e93b69SEric Saxe * is willing to balance against all the top level PGs in the system.
81d0e93b69SEric Saxe *
82d0e93b69SEric Saxe * Groups of top level PGs are maintained to implement the above, one for each
83d0e93b69SEric Saxe * leaf lgroup (containing the top level PGs in that lgroup), and one (for the
84d0e93b69SEric Saxe * root lgroup) that contains all the top level PGs in the system.
85fb2f18f8Sesaxe */
86a6604450Sesaxe static cmt_lgrp_t *cmt_lgrps = NULL; /* cmt_lgrps list head */
87a6604450Sesaxe static cmt_lgrp_t *cpu0_lgrp = NULL; /* boot CPU's initial lgrp */
88a6604450Sesaxe /* used for null_proc_lpa */
890e751525SEric Saxe cmt_lgrp_t *cmt_root = NULL; /* Reference to root cmt pg */
90fb2f18f8Sesaxe
91a6604450Sesaxe static int is_cpu0 = 1; /* true if this is boot CPU context */
92a6604450Sesaxe
93a6604450Sesaxe /*
940e751525SEric Saxe * Array of hardware sharing relationships that are blacklisted.
95d0e93b69SEric Saxe * CMT scheduling optimizations won't be performed for blacklisted sharing
96d0e93b69SEric Saxe * relationships.
970e751525SEric Saxe */
980e751525SEric Saxe static int cmt_hw_blacklisted[PGHW_NUM_COMPONENTS];
990e751525SEric Saxe
1000e751525SEric Saxe /*
101a6604450Sesaxe * Set this to non-zero to disable CMT scheduling
102a6604450Sesaxe * This must be done via kmdb -d, as /etc/system will be too late
103a6604450Sesaxe */
1040e751525SEric Saxe int cmt_sched_disabled = 0;
105fb2f18f8Sesaxe
/*
 * Result codes produced by CMT lineage validation; see
 * pg_cmt_lineage_validate() below. pg_cmt_cpu_init() only continues
 * building the hierarchy when the result is VALID or REPAIRED.
 */
typedef enum cmt_lineage_validation {
	CMT_LINEAGE_VALID,
	CMT_LINEAGE_NON_CONCENTRIC,
	CMT_LINEAGE_PG_SPANS_LGRPS,
	CMT_LINEAGE_NON_PROMOTABLE,
	CMT_LINEAGE_REPAIRED,
	CMT_LINEAGE_UNRECOVERABLE
} cmt_lineage_validation_t;

/*
 * Status of the lineage currently under construction.
 * cpu_lock must be held to change this.
 */
cmt_lineage_validation_t	cmt_lineage_status = CMT_LINEAGE_VALID;
124ef4f35d8SEric Saxe
125ef4f35d8SEric Saxe /*
126ef4f35d8SEric Saxe * Power domain definitions (on x86) are defined by ACPI, and
127ef4f35d8SEric Saxe * therefore may be subject to BIOS bugs.
128ef4f35d8SEric Saxe */
129ef4f35d8SEric Saxe #define PG_CMT_HW_SUSPECT(hw) PGHW_IS_PM_DOMAIN(hw)
130ef4f35d8SEric Saxe
131ef4f35d8SEric Saxe /*
132ef4f35d8SEric Saxe * Macro to test if PG is managed by the CMT PG class
133ef4f35d8SEric Saxe */
134ef4f35d8SEric Saxe #define IS_CMT_PG(pg) (((pg_t *)(pg))->pg_class->pgc_id == pg_cmt_class_id)
135ef4f35d8SEric Saxe
136fb2f18f8Sesaxe static pg_cid_t pg_cmt_class_id; /* PG class id */
137fb2f18f8Sesaxe
138fb2f18f8Sesaxe static pg_t *pg_cmt_alloc();
139fb2f18f8Sesaxe static void pg_cmt_free(pg_t *);
14047ab0c7cSEric Saxe static void pg_cmt_cpu_init(cpu_t *, cpu_pg_t *);
14147ab0c7cSEric Saxe static void pg_cmt_cpu_fini(cpu_t *, cpu_pg_t *);
142fb2f18f8Sesaxe static void pg_cmt_cpu_active(cpu_t *);
143fb2f18f8Sesaxe static void pg_cmt_cpu_inactive(cpu_t *);
144fb2f18f8Sesaxe static void pg_cmt_cpupart_in(cpu_t *, cpupart_t *);
145fb2f18f8Sesaxe static void pg_cmt_cpupart_move(cpu_t *, cpupart_t *, cpupart_t *);
1460e751525SEric Saxe static char *pg_cmt_policy_name(pg_t *);
1470e751525SEric Saxe static void pg_cmt_hier_sort(pg_cmt_t **, int);
1480e751525SEric Saxe static pg_cmt_t *pg_cmt_hier_rank(pg_cmt_t *, pg_cmt_t *);
149fb2f18f8Sesaxe static int pg_cmt_cpu_belongs(pg_t *, cpu_t *);
150fb2f18f8Sesaxe static int pg_cmt_hw(pghw_type_t);
151fb2f18f8Sesaxe static cmt_lgrp_t *pg_cmt_find_lgrp(lgrp_handle_t);
152a6604450Sesaxe static cmt_lgrp_t *pg_cmt_lgrp_create(lgrp_handle_t);
1530e751525SEric Saxe static void cmt_ev_thread_swtch(pg_t *, cpu_t *, hrtime_t,
1540e751525SEric Saxe kthread_t *, kthread_t *);
1550e751525SEric Saxe static void cmt_ev_thread_swtch_pwr(pg_t *, cpu_t *, hrtime_t,
1560e751525SEric Saxe kthread_t *, kthread_t *);
1570e751525SEric Saxe static void cmt_ev_thread_remain_pwr(pg_t *, cpu_t *, kthread_t *);
1581a77c24bSEric Saxe static cmt_lineage_validation_t pg_cmt_lineage_validate(pg_cmt_t **, int *,
1591a77c24bSEric Saxe cpu_pg_t *);
160fb2f18f8Sesaxe
1610e751525SEric Saxe /*
162fb2f18f8Sesaxe * CMT PG ops
163fb2f18f8Sesaxe */
164fb2f18f8Sesaxe struct pg_ops pg_ops_cmt = {
165fb2f18f8Sesaxe pg_cmt_alloc,
166fb2f18f8Sesaxe pg_cmt_free,
167fb2f18f8Sesaxe pg_cmt_cpu_init,
168fb2f18f8Sesaxe pg_cmt_cpu_fini,
169fb2f18f8Sesaxe pg_cmt_cpu_active,
170fb2f18f8Sesaxe pg_cmt_cpu_inactive,
171fb2f18f8Sesaxe pg_cmt_cpupart_in,
172fb2f18f8Sesaxe NULL, /* cpupart_out */
173fb2f18f8Sesaxe pg_cmt_cpupart_move,
174fb2f18f8Sesaxe pg_cmt_cpu_belongs,
1750e751525SEric Saxe pg_cmt_policy_name,
176fb2f18f8Sesaxe };
177fb2f18f8Sesaxe
178fb2f18f8Sesaxe /*
179fb2f18f8Sesaxe * Initialize the CMT PG class
180fb2f18f8Sesaxe */
181fb2f18f8Sesaxe void
pg_cmt_class_init(void)182fb2f18f8Sesaxe pg_cmt_class_init(void)
183fb2f18f8Sesaxe {
184fb2f18f8Sesaxe if (cmt_sched_disabled)
185fb2f18f8Sesaxe return;
186fb2f18f8Sesaxe
187fb2f18f8Sesaxe pg_cmt_class_id = pg_class_register("cmt", &pg_ops_cmt, PGR_PHYSICAL);
188fb2f18f8Sesaxe }
189fb2f18f8Sesaxe
190fb2f18f8Sesaxe /*
191fb2f18f8Sesaxe * Called to indicate a new CPU has started up so
192fb2f18f8Sesaxe * that either t0 or the slave startup thread can
193fb2f18f8Sesaxe * be accounted for.
194fb2f18f8Sesaxe */
195fb2f18f8Sesaxe void
pg_cmt_cpu_startup(cpu_t * cp)196fb2f18f8Sesaxe pg_cmt_cpu_startup(cpu_t *cp)
197fb2f18f8Sesaxe {
1980e751525SEric Saxe pg_ev_thread_swtch(cp, gethrtime_unscaled(), cp->cpu_idle_thread,
1990e751525SEric Saxe cp->cpu_thread);
200fb2f18f8Sesaxe }
201fb2f18f8Sesaxe
202fb2f18f8Sesaxe /*
203fb2f18f8Sesaxe * Return non-zero if thread can migrate between "from" and "to"
204fb2f18f8Sesaxe * without a performance penalty
205fb2f18f8Sesaxe */
206fb2f18f8Sesaxe int
pg_cmt_can_migrate(cpu_t * from,cpu_t * to)207fb2f18f8Sesaxe pg_cmt_can_migrate(cpu_t *from, cpu_t *to)
208fb2f18f8Sesaxe {
209fb2f18f8Sesaxe if (from->cpu_physid->cpu_cacheid ==
210fb2f18f8Sesaxe to->cpu_physid->cpu_cacheid)
211fb2f18f8Sesaxe return (1);
212fb2f18f8Sesaxe return (0);
213fb2f18f8Sesaxe }
214fb2f18f8Sesaxe
215fb2f18f8Sesaxe /*
216fb2f18f8Sesaxe * CMT class specific PG allocation
217fb2f18f8Sesaxe */
218fb2f18f8Sesaxe static pg_t *
pg_cmt_alloc(void)219fb2f18f8Sesaxe pg_cmt_alloc(void)
220fb2f18f8Sesaxe {
221fb2f18f8Sesaxe return (kmem_zalloc(sizeof (pg_cmt_t), KM_NOSLEEP));
222fb2f18f8Sesaxe }
223fb2f18f8Sesaxe
224fb2f18f8Sesaxe /*
225fb2f18f8Sesaxe * Class specific PG de-allocation
226fb2f18f8Sesaxe */
227fb2f18f8Sesaxe static void
pg_cmt_free(pg_t * pg)228fb2f18f8Sesaxe pg_cmt_free(pg_t *pg)
229fb2f18f8Sesaxe {
230fb2f18f8Sesaxe ASSERT(pg != NULL);
231fb2f18f8Sesaxe ASSERT(IS_CMT_PG(pg));
232fb2f18f8Sesaxe
233fb2f18f8Sesaxe kmem_free((pg_cmt_t *)pg, sizeof (pg_cmt_t));
234fb2f18f8Sesaxe }
235fb2f18f8Sesaxe
236fb2f18f8Sesaxe /*
2370e751525SEric Saxe * Given a hardware sharing relationship, return which dispatcher
2380e751525SEric Saxe * policies should be implemented to optimize performance and efficiency
239fb2f18f8Sesaxe */
2400e751525SEric Saxe static pg_cmt_policy_t
pg_cmt_policy(pghw_type_t hw)2410e751525SEric Saxe pg_cmt_policy(pghw_type_t hw)
242fb2f18f8Sesaxe {
2430e751525SEric Saxe pg_cmt_policy_t p;
2440e751525SEric Saxe
2450e751525SEric Saxe /*
2460e751525SEric Saxe * Give the platform a chance to override the default
2470e751525SEric Saxe */
2480e751525SEric Saxe if ((p = pg_plat_cmt_policy(hw)) != CMT_NO_POLICY)
2490e751525SEric Saxe return (p);
2500e751525SEric Saxe
2510e751525SEric Saxe switch (hw) {
2520e751525SEric Saxe case PGHW_IPIPE:
2530e751525SEric Saxe case PGHW_FPU:
2548031591dSSrihari Venkatesan case PGHW_PROCNODE:
2550e751525SEric Saxe case PGHW_CHIP:
2560e751525SEric Saxe return (CMT_BALANCE);
2570e751525SEric Saxe case PGHW_CACHE:
258d3c97224SAlexander Kolbasov return (CMT_AFFINITY | CMT_BALANCE);
2590e751525SEric Saxe case PGHW_POW_ACTIVE:
2600e751525SEric Saxe case PGHW_POW_IDLE:
2610e751525SEric Saxe return (CMT_BALANCE);
2620e751525SEric Saxe default:
2630e751525SEric Saxe return (CMT_NO_POLICY);
2640e751525SEric Saxe }
2650e751525SEric Saxe }
2660e751525SEric Saxe
2670e751525SEric Saxe /*
2680e751525SEric Saxe * Rank the importance of optimizing for the pg1 relationship vs.
2690e751525SEric Saxe * the pg2 relationship.
2700e751525SEric Saxe */
2710e751525SEric Saxe static pg_cmt_t *
pg_cmt_hier_rank(pg_cmt_t * pg1,pg_cmt_t * pg2)2720e751525SEric Saxe pg_cmt_hier_rank(pg_cmt_t *pg1, pg_cmt_t *pg2)
2730e751525SEric Saxe {
2740e751525SEric Saxe pghw_type_t hw1 = ((pghw_t *)pg1)->pghw_hw;
2750e751525SEric Saxe pghw_type_t hw2 = ((pghw_t *)pg2)->pghw_hw;
2760e751525SEric Saxe
2770e751525SEric Saxe /*
2780e751525SEric Saxe * A power domain is only important if CPUPM is enabled.
2790e751525SEric Saxe */
2800e751525SEric Saxe if (cpupm_get_policy() == CPUPM_POLICY_DISABLED) {
2810e751525SEric Saxe if (PGHW_IS_PM_DOMAIN(hw1) && !PGHW_IS_PM_DOMAIN(hw2))
2820e751525SEric Saxe return (pg2);
2830e751525SEric Saxe if (PGHW_IS_PM_DOMAIN(hw2) && !PGHW_IS_PM_DOMAIN(hw1))
2840e751525SEric Saxe return (pg1);
2850e751525SEric Saxe }
2860e751525SEric Saxe
2870e751525SEric Saxe /*
2880e751525SEric Saxe * Otherwise, ask the platform
2890e751525SEric Saxe */
2900e751525SEric Saxe if (pg_plat_hw_rank(hw1, hw2) == hw1)
2910e751525SEric Saxe return (pg1);
2920e751525SEric Saxe else
2930e751525SEric Saxe return (pg2);
2940e751525SEric Saxe }
2950e751525SEric Saxe
2960e751525SEric Saxe /*
2970e751525SEric Saxe * Initialize CMT callbacks for the given PG
2980e751525SEric Saxe */
2990e751525SEric Saxe static void
cmt_callback_init(pg_t * pg)3000e751525SEric Saxe cmt_callback_init(pg_t *pg)
3010e751525SEric Saxe {
302d0e93b69SEric Saxe /*
303d0e93b69SEric Saxe * Stick with the default callbacks if there isn't going to be
304d0e93b69SEric Saxe * any CMT thread placement optimizations implemented.
305d0e93b69SEric Saxe */
306d0e93b69SEric Saxe if (((pg_cmt_t *)pg)->cmt_policy == CMT_NO_POLICY)
307d0e93b69SEric Saxe return;
308d0e93b69SEric Saxe
3090e751525SEric Saxe switch (((pghw_t *)pg)->pghw_hw) {
3100e751525SEric Saxe case PGHW_POW_ACTIVE:
3110e751525SEric Saxe pg->pg_cb.thread_swtch = cmt_ev_thread_swtch_pwr;
3120e751525SEric Saxe pg->pg_cb.thread_remain = cmt_ev_thread_remain_pwr;
3130e751525SEric Saxe break;
3140e751525SEric Saxe default:
3150e751525SEric Saxe pg->pg_cb.thread_swtch = cmt_ev_thread_swtch;
3160e751525SEric Saxe
3170e751525SEric Saxe }
3180e751525SEric Saxe }
3190e751525SEric Saxe
3200e751525SEric Saxe /*
3210e751525SEric Saxe * Promote PG above it's current parent.
3221a77c24bSEric Saxe * This is only legal if PG has an equal or greater number of CPUs than its
3231a77c24bSEric Saxe * parent.
3241a77c24bSEric Saxe *
3251a77c24bSEric Saxe * This routine operates on the CPU specific processor group data (for the CPUs
3261a77c24bSEric Saxe * in the PG being promoted), and may be invoked from a context where one CPU's
3271a77c24bSEric Saxe * PG data is under construction. In this case the argument "pgdata", if not
3281a77c24bSEric Saxe * NULL, is a reference to the CPU's under-construction PG data.
3290e751525SEric Saxe */
3300e751525SEric Saxe static void
cmt_hier_promote(pg_cmt_t * pg,cpu_pg_t * pgdata)3311a77c24bSEric Saxe cmt_hier_promote(pg_cmt_t *pg, cpu_pg_t *pgdata)
3320e751525SEric Saxe {
3330e751525SEric Saxe pg_cmt_t *parent;
3340e751525SEric Saxe group_t *children;
3350e751525SEric Saxe cpu_t *cpu;
3360e751525SEric Saxe group_iter_t iter;
3370e751525SEric Saxe pg_cpu_itr_t cpu_iter;
3380e751525SEric Saxe int r;
3390e751525SEric Saxe int err;
340b025faeeSEric Saxe int nchildren;
3410e751525SEric Saxe
3420e751525SEric Saxe ASSERT(MUTEX_HELD(&cpu_lock));
3430e751525SEric Saxe
3440e751525SEric Saxe parent = pg->cmt_parent;
3450e751525SEric Saxe if (parent == NULL) {
3460e751525SEric Saxe /*
3470e751525SEric Saxe * Nothing to do
3480e751525SEric Saxe */
3490e751525SEric Saxe return;
3500e751525SEric Saxe }
3510e751525SEric Saxe
3520e751525SEric Saxe ASSERT(PG_NUM_CPUS((pg_t *)pg) >= PG_NUM_CPUS((pg_t *)parent));
3530e751525SEric Saxe
3540e751525SEric Saxe /*
3550e751525SEric Saxe * We're changing around the hierarchy, which is actively traversed
3560e751525SEric Saxe * by the dispatcher. Pause CPUS to ensure exclusivity.
3570e751525SEric Saxe */
358*0ed5c46eSJosef 'Jeff' Sipek pause_cpus(NULL, NULL);
3590e751525SEric Saxe
3600e751525SEric Saxe /*
3610e751525SEric Saxe * If necessary, update the parent's sibling set, replacing parent
3620e751525SEric Saxe * with PG.
3630e751525SEric Saxe */
3640e751525SEric Saxe if (parent->cmt_siblings) {
3650e751525SEric Saxe if (group_remove(parent->cmt_siblings, parent, GRP_NORESIZE)
3660e751525SEric Saxe != -1) {
3670e751525SEric Saxe r = group_add(parent->cmt_siblings, pg, GRP_NORESIZE);
3680e751525SEric Saxe ASSERT(r != -1);
3690e751525SEric Saxe }
3700e751525SEric Saxe }
3710e751525SEric Saxe
3720e751525SEric Saxe /*
3730e751525SEric Saxe * If the parent is at the top of the hierarchy, replace it's entry
3740e751525SEric Saxe * in the root lgroup's group of top level PGs.
3750e751525SEric Saxe */
3760e751525SEric Saxe if (parent->cmt_parent == NULL &&
3770e751525SEric Saxe parent->cmt_siblings != &cmt_root->cl_pgs) {
3780e751525SEric Saxe if (group_remove(&cmt_root->cl_pgs, parent, GRP_NORESIZE)
3790e751525SEric Saxe != -1) {
3800e751525SEric Saxe r = group_add(&cmt_root->cl_pgs, pg, GRP_NORESIZE);
3810e751525SEric Saxe ASSERT(r != -1);
3820e751525SEric Saxe }
3830e751525SEric Saxe }
3840e751525SEric Saxe
3850e751525SEric Saxe /*
3860e751525SEric Saxe * We assume (and therefore assert) that the PG being promoted is an
3870e751525SEric Saxe * only child of it's parent. Update the parent's children set
3880e751525SEric Saxe * replacing PG's entry with the parent (since the parent is becoming
389b025faeeSEric Saxe * the child). Then have PG and the parent swap children sets and
390b025faeeSEric Saxe * children counts.
3910e751525SEric Saxe */
3920e751525SEric Saxe ASSERT(GROUP_SIZE(parent->cmt_children) <= 1);
3930e751525SEric Saxe if (group_remove(parent->cmt_children, pg, GRP_NORESIZE) != -1) {
3940e751525SEric Saxe r = group_add(parent->cmt_children, parent, GRP_NORESIZE);
3950e751525SEric Saxe ASSERT(r != -1);
3960e751525SEric Saxe }
3970e751525SEric Saxe
3980e751525SEric Saxe children = pg->cmt_children;
3990e751525SEric Saxe pg->cmt_children = parent->cmt_children;
4000e751525SEric Saxe parent->cmt_children = children;
4010e751525SEric Saxe
402b025faeeSEric Saxe nchildren = pg->cmt_nchildren;
403b025faeeSEric Saxe pg->cmt_nchildren = parent->cmt_nchildren;
404b025faeeSEric Saxe parent->cmt_nchildren = nchildren;
405b025faeeSEric Saxe
4060e751525SEric Saxe /*
4070e751525SEric Saxe * Update the sibling references for PG and it's parent
4080e751525SEric Saxe */
4090e751525SEric Saxe pg->cmt_siblings = parent->cmt_siblings;
4100e751525SEric Saxe parent->cmt_siblings = pg->cmt_children;
4110e751525SEric Saxe
4120e751525SEric Saxe /*
4130e751525SEric Saxe * Update any cached lineages in the per CPU pg data.
4140e751525SEric Saxe */
4150e751525SEric Saxe PG_CPU_ITR_INIT(pg, cpu_iter);
4160e751525SEric Saxe while ((cpu = pg_cpu_next(&cpu_iter)) != NULL) {
4170e751525SEric Saxe int idx;
418b025faeeSEric Saxe int sz;
4190e751525SEric Saxe pg_cmt_t *cpu_pg;
4201a77c24bSEric Saxe cpu_pg_t *pgd; /* CPU's PG data */
4211a77c24bSEric Saxe
4221a77c24bSEric Saxe /*
4231a77c24bSEric Saxe * The CPU's whose lineage is under construction still
4241a77c24bSEric Saxe * references the bootstrap CPU PG data structure.
4251a77c24bSEric Saxe */
4261a77c24bSEric Saxe if (pg_cpu_is_bootstrapped(cpu))
4271a77c24bSEric Saxe pgd = pgdata;
4281a77c24bSEric Saxe else
4291a77c24bSEric Saxe pgd = cpu->cpu_pg;
4300e751525SEric Saxe
4310e751525SEric Saxe /*
4320e751525SEric Saxe * Iterate over the CPU's PGs updating the children
4330e751525SEric Saxe * of the PG being promoted, since they have a new parent.
4340e751525SEric Saxe */
4350e751525SEric Saxe group_iter_init(&iter);
4361a77c24bSEric Saxe while ((cpu_pg = group_iterate(&pgd->cmt_pgs, &iter)) != NULL) {
4370e751525SEric Saxe if (cpu_pg->cmt_parent == pg) {
4380e751525SEric Saxe cpu_pg->cmt_parent = parent;
4390e751525SEric Saxe }
4400e751525SEric Saxe }
4410e751525SEric Saxe
4420e751525SEric Saxe /*
4430e751525SEric Saxe * Update the CMT load balancing lineage
4440e751525SEric Saxe */
4451a77c24bSEric Saxe if ((idx = group_find(&pgd->cmt_pgs, (void *)pg)) == -1) {
4460e751525SEric Saxe /*
4470e751525SEric Saxe * Unless this is the CPU who's lineage is being
4480e751525SEric Saxe * constructed, the PG being promoted should be
4490e751525SEric Saxe * in the lineage.
4500e751525SEric Saxe */
4511a77c24bSEric Saxe ASSERT(pg_cpu_is_bootstrapped(cpu));
4520e751525SEric Saxe continue;
4530e751525SEric Saxe }
4540e751525SEric Saxe
4550e751525SEric Saxe ASSERT(idx > 0);
456b025faeeSEric Saxe ASSERT(GROUP_ACCESS(&pgd->cmt_pgs, idx - 1) == parent);
4570e751525SEric Saxe
4580e751525SEric Saxe /*
4590e751525SEric Saxe * Have the child and the parent swap places in the CPU's
4600e751525SEric Saxe * lineage
4610e751525SEric Saxe */
4621a77c24bSEric Saxe group_remove_at(&pgd->cmt_pgs, idx);
4631a77c24bSEric Saxe group_remove_at(&pgd->cmt_pgs, idx - 1);
4641a77c24bSEric Saxe err = group_add_at(&pgd->cmt_pgs, parent, idx);
4650e751525SEric Saxe ASSERT(err == 0);
4661a77c24bSEric Saxe err = group_add_at(&pgd->cmt_pgs, pg, idx - 1);
4670e751525SEric Saxe ASSERT(err == 0);
468b025faeeSEric Saxe
469b025faeeSEric Saxe /*
470b025faeeSEric Saxe * Ensure cmt_lineage references CPU's leaf PG.
471b025faeeSEric Saxe * Since cmt_pgs is top-down ordered, the bottom is the last
472b025faeeSEric Saxe * element.
473b025faeeSEric Saxe */
474b025faeeSEric Saxe if ((sz = GROUP_SIZE(&pgd->cmt_pgs)) > 0)
475b025faeeSEric Saxe pgd->cmt_lineage = GROUP_ACCESS(&pgd->cmt_pgs, sz - 1);
4760e751525SEric Saxe }
4770e751525SEric Saxe
4780e751525SEric Saxe /*
4790e751525SEric Saxe * Update the parent references for PG and it's parent
4800e751525SEric Saxe */
4810e751525SEric Saxe pg->cmt_parent = parent->cmt_parent;
4820e751525SEric Saxe parent->cmt_parent = pg;
4830e751525SEric Saxe
4840e751525SEric Saxe start_cpus();
485fb2f18f8Sesaxe }
486fb2f18f8Sesaxe
487fb2f18f8Sesaxe /*
488fb2f18f8Sesaxe * CMT class callback for a new CPU entering the system
4891a77c24bSEric Saxe *
4901a77c24bSEric Saxe * This routine operates on the CPU specific processor group data (for the CPU
4911a77c24bSEric Saxe * being initialized). The argument "pgdata" is a reference to the CPU's PG
4921a77c24bSEric Saxe * data to be constructed.
4931a77c24bSEric Saxe *
 * cp->cpu_pg, which is used by the dispatcher to access the CPU's PG data,
 * still references a "bootstrap" structure while this routine runs.
 * pg_cmt_cpu_init() and the routines it calls must be careful to operate
 * only on the "pgdata" argument, and not on cp->cpu_pg.
498fb2f18f8Sesaxe */
499fb2f18f8Sesaxe static void
pg_cmt_cpu_init(cpu_t * cp,cpu_pg_t * pgdata)5001a77c24bSEric Saxe pg_cmt_cpu_init(cpu_t *cp, cpu_pg_t *pgdata)
501fb2f18f8Sesaxe {
502fb2f18f8Sesaxe pg_cmt_t *pg;
503fb2f18f8Sesaxe group_t *cmt_pgs;
5040e751525SEric Saxe int levels, level;
505fb2f18f8Sesaxe pghw_type_t hw;
506fb2f18f8Sesaxe pg_t *pg_cache = NULL;
507fb2f18f8Sesaxe pg_cmt_t *cpu_cmt_hier[PGHW_NUM_COMPONENTS];
508fb2f18f8Sesaxe lgrp_handle_t lgrp_handle;
509fb2f18f8Sesaxe cmt_lgrp_t *lgrp;
510ef4f35d8SEric Saxe cmt_lineage_validation_t lineage_status;
511fb2f18f8Sesaxe
512fb2f18f8Sesaxe ASSERT(MUTEX_HELD(&cpu_lock));
5131a77c24bSEric Saxe ASSERT(pg_cpu_is_bootstrapped(cp));
514fb2f18f8Sesaxe
5150e751525SEric Saxe if (cmt_sched_disabled)
5160e751525SEric Saxe return;
5170e751525SEric Saxe
518fb2f18f8Sesaxe /*
519fb2f18f8Sesaxe * A new CPU is coming into the system.
520fb2f18f8Sesaxe * Interrogate the platform to see if the CPU
5210e751525SEric Saxe * has any performance or efficiency relevant
5220e751525SEric Saxe * sharing relationships
523fb2f18f8Sesaxe */
5241a77c24bSEric Saxe cmt_pgs = &pgdata->cmt_pgs;
5251a77c24bSEric Saxe pgdata->cmt_lineage = NULL;
526fb2f18f8Sesaxe
527fb2f18f8Sesaxe bzero(cpu_cmt_hier, sizeof (cpu_cmt_hier));
5280e751525SEric Saxe levels = 0;
529fb2f18f8Sesaxe for (hw = PGHW_START; hw < PGHW_NUM_COMPONENTS; hw++) {
530fb2f18f8Sesaxe
5310e751525SEric Saxe pg_cmt_policy_t policy;
5320e751525SEric Saxe
533fb2f18f8Sesaxe /*
5340e751525SEric Saxe * We're only interested in the hw sharing relationships
5350e751525SEric Saxe * for which we know how to optimize.
536fb2f18f8Sesaxe */
5370e751525SEric Saxe policy = pg_cmt_policy(hw);
5380e751525SEric Saxe if (policy == CMT_NO_POLICY ||
5390e751525SEric Saxe pg_plat_hw_shared(cp, hw) == 0)
540fb2f18f8Sesaxe continue;
541fb2f18f8Sesaxe
542fb2f18f8Sesaxe /*
543d0e93b69SEric Saxe * We will still create the PGs for hardware sharing
544d0e93b69SEric Saxe * relationships that have been blacklisted, but won't
545d0e93b69SEric Saxe * implement CMT thread placement optimizations against them.
5460e751525SEric Saxe */
547d0e93b69SEric Saxe if (cmt_hw_blacklisted[hw] == 1)
548d0e93b69SEric Saxe policy = CMT_NO_POLICY;
5490e751525SEric Saxe
5500e751525SEric Saxe /*
551fb2f18f8Sesaxe * Find (or create) the PG associated with
552fb2f18f8Sesaxe * the hw sharing relationship in which cp
553fb2f18f8Sesaxe * belongs.
554fb2f18f8Sesaxe *
555fb2f18f8Sesaxe * Determine if a suitable PG already
556fb2f18f8Sesaxe * exists, or if one needs to be created.
557fb2f18f8Sesaxe */
558fb2f18f8Sesaxe pg = (pg_cmt_t *)pghw_place_cpu(cp, hw);
559fb2f18f8Sesaxe if (pg == NULL) {
560fb2f18f8Sesaxe /*
561fb2f18f8Sesaxe * Create a new one.
562fb2f18f8Sesaxe * Initialize the common...
563fb2f18f8Sesaxe */
564fb2f18f8Sesaxe pg = (pg_cmt_t *)pg_create(pg_cmt_class_id);
565fb2f18f8Sesaxe
566fb2f18f8Sesaxe /* ... physical ... */
567fb2f18f8Sesaxe pghw_init((pghw_t *)pg, cp, hw);
568fb2f18f8Sesaxe
569fb2f18f8Sesaxe /*
570fb2f18f8Sesaxe * ... and CMT specific portions of the
571fb2f18f8Sesaxe * structure.
572fb2f18f8Sesaxe */
5730e751525SEric Saxe pg->cmt_policy = policy;
5740e751525SEric Saxe
5750e751525SEric Saxe /* CMT event callbacks */
5760e751525SEric Saxe cmt_callback_init((pg_t *)pg);
5770e751525SEric Saxe
578fb2f18f8Sesaxe bitset_init(&pg->cmt_cpus_actv_set);
579fb2f18f8Sesaxe group_create(&pg->cmt_cpus_actv);
580fb2f18f8Sesaxe } else {
581fb2f18f8Sesaxe ASSERT(IS_CMT_PG(pg));
582fb2f18f8Sesaxe }
583fb2f18f8Sesaxe
584b885580bSAlexander Kolbasov ((pghw_t *)pg)->pghw_generation++;
585b885580bSAlexander Kolbasov
586fb2f18f8Sesaxe /* Add the CPU to the PG */
5871a77c24bSEric Saxe pg_cpu_add((pg_t *)pg, cp, pgdata);
588fb2f18f8Sesaxe
589fb2f18f8Sesaxe /*
5906890d023SEric Saxe * Ensure capacity of the active CPU group/bitset
591fb2f18f8Sesaxe */
592fb2f18f8Sesaxe group_expand(&pg->cmt_cpus_actv,
593fb2f18f8Sesaxe GROUP_SIZE(&((pg_t *)pg)->pg_cpus));
594fb2f18f8Sesaxe
595fb2f18f8Sesaxe if (cp->cpu_seqid >=
596fb2f18f8Sesaxe bitset_capacity(&pg->cmt_cpus_actv_set)) {
597fb2f18f8Sesaxe bitset_resize(&pg->cmt_cpus_actv_set,
598fb2f18f8Sesaxe cp->cpu_seqid + 1);
599fb2f18f8Sesaxe }
600fb2f18f8Sesaxe
601fb2f18f8Sesaxe /*
6020e751525SEric Saxe * Build a lineage of CMT PGs for load balancing / coalescence
603fb2f18f8Sesaxe */
6040e751525SEric Saxe if (policy & (CMT_BALANCE | CMT_COALESCE)) {
6050e751525SEric Saxe cpu_cmt_hier[levels++] = pg;
606fb2f18f8Sesaxe }
607fb2f18f8Sesaxe
608fb2f18f8Sesaxe /* Cache this for later */
609fb2f18f8Sesaxe if (hw == PGHW_CACHE)
610fb2f18f8Sesaxe pg_cache = (pg_t *)pg;
611fb2f18f8Sesaxe }
612fb2f18f8Sesaxe
6130e751525SEric Saxe group_expand(cmt_pgs, levels);
6146890d023SEric Saxe
6156890d023SEric Saxe if (cmt_root == NULL)
6166890d023SEric Saxe cmt_root = pg_cmt_lgrp_create(lgrp_plat_root_hand());
617fb2f18f8Sesaxe
618fb2f18f8Sesaxe /*
6190e751525SEric Saxe * Find the lgrp that encapsulates this CPU's CMT hierarchy
6206890d023SEric Saxe */
6216890d023SEric Saxe lgrp_handle = lgrp_plat_cpu_to_hand(cp->cpu_id);
6226890d023SEric Saxe if ((lgrp = pg_cmt_find_lgrp(lgrp_handle)) == NULL)
6236890d023SEric Saxe lgrp = pg_cmt_lgrp_create(lgrp_handle);
6246890d023SEric Saxe
6256890d023SEric Saxe /*
6260e751525SEric Saxe * Ascendingly sort the PGs in the lineage by number of CPUs
6270e751525SEric Saxe */
6280e751525SEric Saxe pg_cmt_hier_sort(cpu_cmt_hier, levels);
6290e751525SEric Saxe
6300e751525SEric Saxe /*
6310e751525SEric Saxe * Examine the lineage and validate it.
6320e751525SEric Saxe * This routine will also try to fix the lineage along with the
6330e751525SEric Saxe * rest of the PG hierarchy should it detect an issue.
6340e751525SEric Saxe *
635ef4f35d8SEric Saxe * If it returns anything other than VALID or REPAIRED, an
636ef4f35d8SEric Saxe * unrecoverable error has occurred, and we cannot proceed.
6370e751525SEric Saxe */
6381a77c24bSEric Saxe lineage_status = pg_cmt_lineage_validate(cpu_cmt_hier, &levels, pgdata);
639ef4f35d8SEric Saxe if ((lineage_status != CMT_LINEAGE_VALID) &&
6401a77c24bSEric Saxe (lineage_status != CMT_LINEAGE_REPAIRED)) {
6411a77c24bSEric Saxe /*
6421a77c24bSEric Saxe * In the case of an unrecoverable error where CMT scheduling
6431a77c24bSEric Saxe * has been disabled, assert that the under construction CPU's
6441a77c24bSEric Saxe * PG data has an empty CMT load balancing lineage.
6451a77c24bSEric Saxe */
6461a77c24bSEric Saxe ASSERT((cmt_sched_disabled == 0) ||
6471a77c24bSEric Saxe (GROUP_SIZE(&(pgdata->cmt_pgs)) == 0));
6480e751525SEric Saxe return;
6491a77c24bSEric Saxe }
6500e751525SEric Saxe
6510e751525SEric Saxe /*
6520e751525SEric Saxe * For existing PGs in the lineage, verify that the parent is
6530e751525SEric Saxe * correct, as the generation in the lineage may have changed
6540e751525SEric Saxe * as a result of the sorting. Start the traversal at the top
6550e751525SEric Saxe * of the lineage, moving down.
6560e751525SEric Saxe */
6570e751525SEric Saxe for (level = levels - 1; level >= 0; ) {
6580e751525SEric Saxe int reorg;
6590e751525SEric Saxe
6600e751525SEric Saxe reorg = 0;
6610e751525SEric Saxe pg = cpu_cmt_hier[level];
6620e751525SEric Saxe
6630e751525SEric Saxe /*
6640e751525SEric Saxe * Promote PGs at an incorrect generation into place.
6650e751525SEric Saxe */
6660e751525SEric Saxe while (pg->cmt_parent &&
6670e751525SEric Saxe pg->cmt_parent != cpu_cmt_hier[level + 1]) {
6681a77c24bSEric Saxe cmt_hier_promote(pg, pgdata);
6690e751525SEric Saxe reorg++;
6700e751525SEric Saxe }
6710e751525SEric Saxe if (reorg > 0)
6720e751525SEric Saxe level = levels - 1;
6730e751525SEric Saxe else
6740e751525SEric Saxe level--;
6750e751525SEric Saxe }
6760e751525SEric Saxe
6770e751525SEric Saxe /*
6786890d023SEric Saxe * For each of the PGs in the CPU's lineage:
6790e751525SEric Saxe * - Add an entry in the CPU sorted CMT PG group
6800e751525SEric Saxe * which is used for top down CMT load balancing
681fb2f18f8Sesaxe * - Tie the PG into the CMT hierarchy by connecting
682fb2f18f8Sesaxe * it to it's parent and siblings.
683fb2f18f8Sesaxe */
6840e751525SEric Saxe for (level = 0; level < levels; level++) {
685fb2f18f8Sesaxe uint_t children;
686fb2f18f8Sesaxe int err;
687fb2f18f8Sesaxe
688fb2f18f8Sesaxe pg = cpu_cmt_hier[level];
6890e751525SEric Saxe err = group_add_at(cmt_pgs, pg, levels - level - 1);
690fb2f18f8Sesaxe ASSERT(err == 0);
691fb2f18f8Sesaxe
692fb2f18f8Sesaxe if (level == 0)
6931a77c24bSEric Saxe pgdata->cmt_lineage = (pg_t *)pg;
694fb2f18f8Sesaxe
695fb2f18f8Sesaxe if (pg->cmt_siblings != NULL) {
696fb2f18f8Sesaxe /* Already initialized */
697fb2f18f8Sesaxe ASSERT(pg->cmt_parent == NULL ||
698fb2f18f8Sesaxe pg->cmt_parent == cpu_cmt_hier[level + 1]);
699fb2f18f8Sesaxe ASSERT(pg->cmt_siblings == &lgrp->cl_pgs ||
700c416da2dSjb145095 ((pg->cmt_parent != NULL) &&
701c416da2dSjb145095 pg->cmt_siblings == pg->cmt_parent->cmt_children));
702fb2f18f8Sesaxe continue;
703fb2f18f8Sesaxe }
704fb2f18f8Sesaxe
7050e751525SEric Saxe if ((level + 1) == levels) {
706fb2f18f8Sesaxe pg->cmt_parent = NULL;
7076890d023SEric Saxe
708fb2f18f8Sesaxe pg->cmt_siblings = &lgrp->cl_pgs;
709fb2f18f8Sesaxe children = ++lgrp->cl_npgs;
7100e751525SEric Saxe if (cmt_root != lgrp)
7116890d023SEric Saxe cmt_root->cl_npgs++;
712fb2f18f8Sesaxe } else {
713fb2f18f8Sesaxe pg->cmt_parent = cpu_cmt_hier[level + 1];
714fb2f18f8Sesaxe
715fb2f18f8Sesaxe /*
716fb2f18f8Sesaxe * A good parent keeps track of their children.
717fb2f18f8Sesaxe * The parent's children group is also the PG's
718fb2f18f8Sesaxe * siblings.
719fb2f18f8Sesaxe */
720fb2f18f8Sesaxe if (pg->cmt_parent->cmt_children == NULL) {
721fb2f18f8Sesaxe pg->cmt_parent->cmt_children =
722fb2f18f8Sesaxe kmem_zalloc(sizeof (group_t), KM_SLEEP);
723fb2f18f8Sesaxe group_create(pg->cmt_parent->cmt_children);
724fb2f18f8Sesaxe }
725fb2f18f8Sesaxe pg->cmt_siblings = pg->cmt_parent->cmt_children;
726fb2f18f8Sesaxe children = ++pg->cmt_parent->cmt_nchildren;
727fb2f18f8Sesaxe }
7286890d023SEric Saxe
729fb2f18f8Sesaxe group_expand(pg->cmt_siblings, children);
7306890d023SEric Saxe group_expand(&cmt_root->cl_pgs, cmt_root->cl_npgs);
731fb2f18f8Sesaxe }
732fb2f18f8Sesaxe
733fb2f18f8Sesaxe /*
734fb2f18f8Sesaxe * Cache the chip and core IDs in the cpu_t->cpu_physid structure
735fb2f18f8Sesaxe * for fast lookups later.
736fb2f18f8Sesaxe */
737fb2f18f8Sesaxe if (cp->cpu_physid) {
738fb2f18f8Sesaxe cp->cpu_physid->cpu_chipid =
739fb2f18f8Sesaxe pg_plat_hw_instance_id(cp, PGHW_CHIP);
740fb2f18f8Sesaxe cp->cpu_physid->cpu_coreid = pg_plat_get_core_id(cp);
741fb2f18f8Sesaxe
742fb2f18f8Sesaxe /*
743fb2f18f8Sesaxe * If this cpu has a PG representing shared cache, then set
744fb2f18f8Sesaxe * cpu_cacheid to that PG's logical id
745fb2f18f8Sesaxe */
746fb2f18f8Sesaxe if (pg_cache)
747fb2f18f8Sesaxe cp->cpu_physid->cpu_cacheid = pg_cache->pg_id;
748fb2f18f8Sesaxe }
749fb2f18f8Sesaxe
750fb2f18f8Sesaxe /* CPU0 only initialization */
751fb2f18f8Sesaxe if (is_cpu0) {
752fb2f18f8Sesaxe is_cpu0 = 0;
753a6604450Sesaxe cpu0_lgrp = lgrp;
754fb2f18f8Sesaxe }
755fb2f18f8Sesaxe
756fb2f18f8Sesaxe }
757fb2f18f8Sesaxe
758fb2f18f8Sesaxe /*
759fb2f18f8Sesaxe * Class callback when a CPU is leaving the system (deletion)
7601a77c24bSEric Saxe *
7611a77c24bSEric Saxe * "pgdata" is a reference to the CPU's PG data to be deconstructed.
7621a77c24bSEric Saxe *
7631a77c24bSEric Saxe * cp->cpu_pg is used by the dispatcher to access the CPU's PG data
7641a77c24bSEric Saxe * references a "bootstrap" structure across this function's invocation.
765b885580bSAlexander Kolbasov * pg_cmt_cpu_fini() and the routines it calls must be careful to operate only
7661a77c24bSEric Saxe * on the "pgdata" argument, and not cp->cpu_pg.
767fb2f18f8Sesaxe */
768fb2f18f8Sesaxe static void
pg_cmt_cpu_fini(cpu_t * cp,cpu_pg_t * pgdata)7691a77c24bSEric Saxe pg_cmt_cpu_fini(cpu_t *cp, cpu_pg_t *pgdata)
770fb2f18f8Sesaxe {
771fb2f18f8Sesaxe group_iter_t i;
772fb2f18f8Sesaxe pg_cmt_t *pg;
773fb2f18f8Sesaxe group_t *pgs, *cmt_pgs;
774fb2f18f8Sesaxe lgrp_handle_t lgrp_handle;
775fb2f18f8Sesaxe cmt_lgrp_t *lgrp;
776fb2f18f8Sesaxe
7770e751525SEric Saxe if (cmt_sched_disabled)
7780e751525SEric Saxe return;
7790e751525SEric Saxe
7801a77c24bSEric Saxe ASSERT(pg_cpu_is_bootstrapped(cp));
7811a77c24bSEric Saxe
7821a77c24bSEric Saxe pgs = &pgdata->pgs;
7831a77c24bSEric Saxe cmt_pgs = &pgdata->cmt_pgs;
784fb2f18f8Sesaxe
785fb2f18f8Sesaxe /*
786fb2f18f8Sesaxe * Find the lgroup that encapsulates this CPU's CMT hierarchy
787fb2f18f8Sesaxe */
788fb2f18f8Sesaxe lgrp_handle = lgrp_plat_cpu_to_hand(cp->cpu_id);
789a6604450Sesaxe
790fb2f18f8Sesaxe lgrp = pg_cmt_find_lgrp(lgrp_handle);
7913e81cacfSEric Saxe if (ncpus == 1 && lgrp != cpu0_lgrp) {
792a6604450Sesaxe /*
7933e81cacfSEric Saxe * One might wonder how we could be deconfiguring the
7943e81cacfSEric Saxe * only CPU in the system.
795a6604450Sesaxe *
7963e81cacfSEric Saxe * On Starcat systems when null_proc_lpa is detected,
7973e81cacfSEric Saxe * the boot CPU (which is already configured into a leaf
7983e81cacfSEric Saxe * lgroup), is moved into the root lgroup. This is done by
7993e81cacfSEric Saxe * deconfiguring it from both lgroups and processor
8003e81cacfSEric Saxe * groups), and then later reconfiguring it back in. This
8013e81cacfSEric Saxe * call to pg_cmt_cpu_fini() is part of that deconfiguration.
8023e81cacfSEric Saxe *
8033e81cacfSEric Saxe * This special case is detected by noting that the platform
8043e81cacfSEric Saxe * has changed the CPU's lgrp affiliation (since it now
8053e81cacfSEric Saxe * belongs in the root). In this case, use the cmt_lgrp_t
8063e81cacfSEric Saxe * cached for the boot CPU, since this is what needs to be
8073e81cacfSEric Saxe * torn down.
808a6604450Sesaxe */
809a6604450Sesaxe lgrp = cpu0_lgrp;
810a6604450Sesaxe }
811fb2f18f8Sesaxe
8123e81cacfSEric Saxe ASSERT(lgrp != NULL);
8133e81cacfSEric Saxe
814fb2f18f8Sesaxe /*
815fb2f18f8Sesaxe * First, clean up anything load balancing specific for each of
816fb2f18f8Sesaxe * the CPU's PGs that participated in CMT load balancing
817fb2f18f8Sesaxe */
8181a77c24bSEric Saxe pg = (pg_cmt_t *)pgdata->cmt_lineage;
819fb2f18f8Sesaxe while (pg != NULL) {
820fb2f18f8Sesaxe
821b885580bSAlexander Kolbasov ((pghw_t *)pg)->pghw_generation++;
822b885580bSAlexander Kolbasov
823fb2f18f8Sesaxe /*
824fb2f18f8Sesaxe * Remove the PG from the CPU's load balancing lineage
825fb2f18f8Sesaxe */
826fb2f18f8Sesaxe (void) group_remove(cmt_pgs, pg, GRP_RESIZE);
827fb2f18f8Sesaxe
828fb2f18f8Sesaxe /*
829fb2f18f8Sesaxe * If it's about to become empty, destroy it's children
830fb2f18f8Sesaxe * group, and remove it's reference from it's siblings.
831fb2f18f8Sesaxe * This is done here (rather than below) to avoid removing
832fb2f18f8Sesaxe * our reference from a PG that we just eliminated.
833fb2f18f8Sesaxe */
834fb2f18f8Sesaxe if (GROUP_SIZE(&((pg_t *)pg)->pg_cpus) == 1) {
835fb2f18f8Sesaxe if (pg->cmt_children != NULL)
836fb2f18f8Sesaxe group_destroy(pg->cmt_children);
837fb2f18f8Sesaxe if (pg->cmt_siblings != NULL) {
838fb2f18f8Sesaxe if (pg->cmt_siblings == &lgrp->cl_pgs)
839fb2f18f8Sesaxe lgrp->cl_npgs--;
840fb2f18f8Sesaxe else
841fb2f18f8Sesaxe pg->cmt_parent->cmt_nchildren--;
842fb2f18f8Sesaxe }
843fb2f18f8Sesaxe }
844fb2f18f8Sesaxe pg = pg->cmt_parent;
845fb2f18f8Sesaxe }
846fb2f18f8Sesaxe ASSERT(GROUP_SIZE(cmt_pgs) == 0);
847fb2f18f8Sesaxe
848fb2f18f8Sesaxe /*
849fb2f18f8Sesaxe * Now that the load balancing lineage updates have happened,
850fb2f18f8Sesaxe * remove the CPU from all it's PGs (destroying any that become
851fb2f18f8Sesaxe * empty).
852fb2f18f8Sesaxe */
853fb2f18f8Sesaxe group_iter_init(&i);
854fb2f18f8Sesaxe while ((pg = group_iterate(pgs, &i)) != NULL) {
855fb2f18f8Sesaxe if (IS_CMT_PG(pg) == 0)
856fb2f18f8Sesaxe continue;
857fb2f18f8Sesaxe
8581a77c24bSEric Saxe pg_cpu_delete((pg_t *)pg, cp, pgdata);
859fb2f18f8Sesaxe /*
860fb2f18f8Sesaxe * Deleting the CPU from the PG changes the CPU's
861fb2f18f8Sesaxe * PG group over which we are actively iterating
862fb2f18f8Sesaxe * Re-initialize the iteration
863fb2f18f8Sesaxe */
864fb2f18f8Sesaxe group_iter_init(&i);
865fb2f18f8Sesaxe
866fb2f18f8Sesaxe if (GROUP_SIZE(&((pg_t *)pg)->pg_cpus) == 0) {
867fb2f18f8Sesaxe
868fb2f18f8Sesaxe /*
869fb2f18f8Sesaxe * The PG has become zero sized, so destroy it.
870fb2f18f8Sesaxe */
871fb2f18f8Sesaxe group_destroy(&pg->cmt_cpus_actv);
872fb2f18f8Sesaxe bitset_fini(&pg->cmt_cpus_actv_set);
873fb2f18f8Sesaxe pghw_fini((pghw_t *)pg);
874fb2f18f8Sesaxe
875fb2f18f8Sesaxe pg_destroy((pg_t *)pg);
876fb2f18f8Sesaxe }
877fb2f18f8Sesaxe }
878fb2f18f8Sesaxe }
879fb2f18f8Sesaxe
880fb2f18f8Sesaxe /*
881fb2f18f8Sesaxe * Class callback when a CPU is entering a cpu partition
882fb2f18f8Sesaxe */
883fb2f18f8Sesaxe static void
pg_cmt_cpupart_in(cpu_t * cp,cpupart_t * pp)884fb2f18f8Sesaxe pg_cmt_cpupart_in(cpu_t *cp, cpupart_t *pp)
885fb2f18f8Sesaxe {
886fb2f18f8Sesaxe group_t *pgs;
887fb2f18f8Sesaxe pg_t *pg;
888fb2f18f8Sesaxe group_iter_t i;
889fb2f18f8Sesaxe
890fb2f18f8Sesaxe ASSERT(MUTEX_HELD(&cpu_lock));
891fb2f18f8Sesaxe
8920e751525SEric Saxe if (cmt_sched_disabled)
8930e751525SEric Saxe return;
8940e751525SEric Saxe
895fb2f18f8Sesaxe pgs = &cp->cpu_pg->pgs;
896fb2f18f8Sesaxe
897fb2f18f8Sesaxe /*
898fb2f18f8Sesaxe * Ensure that the new partition's PG bitset
899fb2f18f8Sesaxe * is large enough for all CMT PG's to which cp
900fb2f18f8Sesaxe * belongs
901fb2f18f8Sesaxe */
902fb2f18f8Sesaxe group_iter_init(&i);
903fb2f18f8Sesaxe while ((pg = group_iterate(pgs, &i)) != NULL) {
904fb2f18f8Sesaxe if (IS_CMT_PG(pg) == 0)
905fb2f18f8Sesaxe continue;
906fb2f18f8Sesaxe
907fb2f18f8Sesaxe if (bitset_capacity(&pp->cp_cmt_pgs) <= pg->pg_id)
908fb2f18f8Sesaxe bitset_resize(&pp->cp_cmt_pgs, pg->pg_id + 1);
909fb2f18f8Sesaxe }
910fb2f18f8Sesaxe }
911fb2f18f8Sesaxe
912fb2f18f8Sesaxe /*
913fb2f18f8Sesaxe * Class callback when a CPU is actually moving partitions
914fb2f18f8Sesaxe */
915fb2f18f8Sesaxe static void
pg_cmt_cpupart_move(cpu_t * cp,cpupart_t * oldpp,cpupart_t * newpp)916fb2f18f8Sesaxe pg_cmt_cpupart_move(cpu_t *cp, cpupart_t *oldpp, cpupart_t *newpp)
917fb2f18f8Sesaxe {
918fb2f18f8Sesaxe cpu_t *cpp;
919fb2f18f8Sesaxe group_t *pgs;
920fb2f18f8Sesaxe pg_t *pg;
921fb2f18f8Sesaxe group_iter_t pg_iter;
922fb2f18f8Sesaxe pg_cpu_itr_t cpu_iter;
923fb2f18f8Sesaxe boolean_t found;
924fb2f18f8Sesaxe
925fb2f18f8Sesaxe ASSERT(MUTEX_HELD(&cpu_lock));
926fb2f18f8Sesaxe
9270e751525SEric Saxe if (cmt_sched_disabled)
9280e751525SEric Saxe return;
9290e751525SEric Saxe
930fb2f18f8Sesaxe pgs = &cp->cpu_pg->pgs;
931fb2f18f8Sesaxe group_iter_init(&pg_iter);
932fb2f18f8Sesaxe
933fb2f18f8Sesaxe /*
934fb2f18f8Sesaxe * Iterate over the CPUs CMT PGs
935fb2f18f8Sesaxe */
936fb2f18f8Sesaxe while ((pg = group_iterate(pgs, &pg_iter)) != NULL) {
937fb2f18f8Sesaxe
938fb2f18f8Sesaxe if (IS_CMT_PG(pg) == 0)
939fb2f18f8Sesaxe continue;
940fb2f18f8Sesaxe
941fb2f18f8Sesaxe /*
942fb2f18f8Sesaxe * Add the PG to the bitset in the new partition.
943fb2f18f8Sesaxe */
944fb2f18f8Sesaxe bitset_add(&newpp->cp_cmt_pgs, pg->pg_id);
945fb2f18f8Sesaxe
946fb2f18f8Sesaxe /*
947fb2f18f8Sesaxe * Remove the PG from the bitset in the old partition
948fb2f18f8Sesaxe * if the last of the PG's CPUs have left.
949fb2f18f8Sesaxe */
950fb2f18f8Sesaxe found = B_FALSE;
951fb2f18f8Sesaxe PG_CPU_ITR_INIT(pg, cpu_iter);
952fb2f18f8Sesaxe while ((cpp = pg_cpu_next(&cpu_iter)) != NULL) {
953fb2f18f8Sesaxe if (cpp == cp)
954fb2f18f8Sesaxe continue;
955a6604450Sesaxe if (CPU_ACTIVE(cpp) &&
956a6604450Sesaxe cpp->cpu_part->cp_id == oldpp->cp_id) {
957fb2f18f8Sesaxe found = B_TRUE;
958fb2f18f8Sesaxe break;
959fb2f18f8Sesaxe }
960fb2f18f8Sesaxe }
961fb2f18f8Sesaxe if (!found)
962fb2f18f8Sesaxe bitset_del(&cp->cpu_part->cp_cmt_pgs, pg->pg_id);
963fb2f18f8Sesaxe }
964fb2f18f8Sesaxe }
965fb2f18f8Sesaxe
966fb2f18f8Sesaxe /*
967fb2f18f8Sesaxe * Class callback when a CPU becomes active (online)
968fb2f18f8Sesaxe *
969fb2f18f8Sesaxe * This is called in a context where CPUs are paused
970fb2f18f8Sesaxe */
971fb2f18f8Sesaxe static void
pg_cmt_cpu_active(cpu_t * cp)972fb2f18f8Sesaxe pg_cmt_cpu_active(cpu_t *cp)
973fb2f18f8Sesaxe {
974fb2f18f8Sesaxe int err;
975fb2f18f8Sesaxe group_iter_t i;
976fb2f18f8Sesaxe pg_cmt_t *pg;
977fb2f18f8Sesaxe group_t *pgs;
978fb2f18f8Sesaxe
979fb2f18f8Sesaxe ASSERT(MUTEX_HELD(&cpu_lock));
980fb2f18f8Sesaxe
9810e751525SEric Saxe if (cmt_sched_disabled)
9820e751525SEric Saxe return;
9830e751525SEric Saxe
984fb2f18f8Sesaxe pgs = &cp->cpu_pg->pgs;
985fb2f18f8Sesaxe group_iter_init(&i);
986fb2f18f8Sesaxe
987fb2f18f8Sesaxe /*
988fb2f18f8Sesaxe * Iterate over the CPU's PGs
989fb2f18f8Sesaxe */
990fb2f18f8Sesaxe while ((pg = group_iterate(pgs, &i)) != NULL) {
991fb2f18f8Sesaxe
992fb2f18f8Sesaxe if (IS_CMT_PG(pg) == 0)
993fb2f18f8Sesaxe continue;
994fb2f18f8Sesaxe
995b885580bSAlexander Kolbasov /*
996b885580bSAlexander Kolbasov * Move to the next generation since topology is changing
997b885580bSAlexander Kolbasov */
998b885580bSAlexander Kolbasov ((pghw_t *)pg)->pghw_generation++;
999b885580bSAlexander Kolbasov
1000fb2f18f8Sesaxe err = group_add(&pg->cmt_cpus_actv, cp, GRP_NORESIZE);
1001fb2f18f8Sesaxe ASSERT(err == 0);
1002fb2f18f8Sesaxe
1003fb2f18f8Sesaxe /*
1004fb2f18f8Sesaxe * If this is the first active CPU in the PG, and it
1005fb2f18f8Sesaxe * represents a hardware sharing relationship over which
1006fb2f18f8Sesaxe * CMT load balancing is performed, add it as a candidate
1007fb2f18f8Sesaxe * for balancing with it's siblings.
1008fb2f18f8Sesaxe */
1009fb2f18f8Sesaxe if (GROUP_SIZE(&pg->cmt_cpus_actv) == 1 &&
10100e751525SEric Saxe (pg->cmt_policy & (CMT_BALANCE | CMT_COALESCE))) {
1011fb2f18f8Sesaxe err = group_add(pg->cmt_siblings, pg, GRP_NORESIZE);
1012fb2f18f8Sesaxe ASSERT(err == 0);
10136890d023SEric Saxe
10146890d023SEric Saxe /*
10156890d023SEric Saxe * If this is a top level PG, add it as a balancing
10160e751525SEric Saxe * candidate when balancing within the root lgroup.
10176890d023SEric Saxe */
10180e751525SEric Saxe if (pg->cmt_parent == NULL &&
10190e751525SEric Saxe pg->cmt_siblings != &cmt_root->cl_pgs) {
10206890d023SEric Saxe err = group_add(&cmt_root->cl_pgs, pg,
10216890d023SEric Saxe GRP_NORESIZE);
10226890d023SEric Saxe ASSERT(err == 0);
10236890d023SEric Saxe }
1024fb2f18f8Sesaxe }
1025fb2f18f8Sesaxe
1026fb2f18f8Sesaxe /*
1027fb2f18f8Sesaxe * Notate the CPU in the PGs active CPU bitset.
1028fb2f18f8Sesaxe * Also notate the PG as being active in it's associated
1029fb2f18f8Sesaxe * partition
1030fb2f18f8Sesaxe */
1031fb2f18f8Sesaxe bitset_add(&pg->cmt_cpus_actv_set, cp->cpu_seqid);
1032fb2f18f8Sesaxe bitset_add(&cp->cpu_part->cp_cmt_pgs, ((pg_t *)pg)->pg_id);
1033fb2f18f8Sesaxe }
1034fb2f18f8Sesaxe }
1035fb2f18f8Sesaxe
1036fb2f18f8Sesaxe /*
1037fb2f18f8Sesaxe * Class callback when a CPU goes inactive (offline)
1038fb2f18f8Sesaxe *
1039fb2f18f8Sesaxe * This is called in a context where CPUs are paused
1040fb2f18f8Sesaxe */
1041fb2f18f8Sesaxe static void
pg_cmt_cpu_inactive(cpu_t * cp)1042fb2f18f8Sesaxe pg_cmt_cpu_inactive(cpu_t *cp)
1043fb2f18f8Sesaxe {
1044fb2f18f8Sesaxe int err;
1045fb2f18f8Sesaxe group_t *pgs;
1046fb2f18f8Sesaxe pg_cmt_t *pg;
1047fb2f18f8Sesaxe cpu_t *cpp;
1048fb2f18f8Sesaxe group_iter_t i;
1049fb2f18f8Sesaxe pg_cpu_itr_t cpu_itr;
1050fb2f18f8Sesaxe boolean_t found;
1051fb2f18f8Sesaxe
1052fb2f18f8Sesaxe ASSERT(MUTEX_HELD(&cpu_lock));
1053fb2f18f8Sesaxe
10540e751525SEric Saxe if (cmt_sched_disabled)
10550e751525SEric Saxe return;
10560e751525SEric Saxe
1057fb2f18f8Sesaxe pgs = &cp->cpu_pg->pgs;
1058fb2f18f8Sesaxe group_iter_init(&i);
1059fb2f18f8Sesaxe
1060fb2f18f8Sesaxe while ((pg = group_iterate(pgs, &i)) != NULL) {
1061fb2f18f8Sesaxe
1062fb2f18f8Sesaxe if (IS_CMT_PG(pg) == 0)
1063fb2f18f8Sesaxe continue;
1064fb2f18f8Sesaxe
1065fb2f18f8Sesaxe /*
1066b885580bSAlexander Kolbasov * Move to the next generation since topology is changing
1067b885580bSAlexander Kolbasov */
1068b885580bSAlexander Kolbasov ((pghw_t *)pg)->pghw_generation++;
1069b885580bSAlexander Kolbasov
1070b885580bSAlexander Kolbasov /*
1071fb2f18f8Sesaxe * Remove the CPU from the CMT PGs active CPU group
1072fb2f18f8Sesaxe * bitmap
1073fb2f18f8Sesaxe */
1074fb2f18f8Sesaxe err = group_remove(&pg->cmt_cpus_actv, cp, GRP_NORESIZE);
1075fb2f18f8Sesaxe ASSERT(err == 0);
1076fb2f18f8Sesaxe
1077fb2f18f8Sesaxe bitset_del(&pg->cmt_cpus_actv_set, cp->cpu_seqid);
1078fb2f18f8Sesaxe
1079fb2f18f8Sesaxe /*
1080fb2f18f8Sesaxe * If there are no more active CPUs in this PG over which
1081fb2f18f8Sesaxe * load was balanced, remove it as a balancing candidate.
1082fb2f18f8Sesaxe */
1083fb2f18f8Sesaxe if (GROUP_SIZE(&pg->cmt_cpus_actv) == 0 &&
10840e751525SEric Saxe (pg->cmt_policy & (CMT_BALANCE | CMT_COALESCE))) {
1085fb2f18f8Sesaxe err = group_remove(pg->cmt_siblings, pg, GRP_NORESIZE);
1086fb2f18f8Sesaxe ASSERT(err == 0);
10876890d023SEric Saxe
10880e751525SEric Saxe if (pg->cmt_parent == NULL &&
10890e751525SEric Saxe pg->cmt_siblings != &cmt_root->cl_pgs) {
10906890d023SEric Saxe err = group_remove(&cmt_root->cl_pgs, pg,
10916890d023SEric Saxe GRP_NORESIZE);
10926890d023SEric Saxe ASSERT(err == 0);
10936890d023SEric Saxe }
1094fb2f18f8Sesaxe }
1095fb2f18f8Sesaxe
1096fb2f18f8Sesaxe /*
1097fb2f18f8Sesaxe * Assert the number of active CPUs does not exceed
1098fb2f18f8Sesaxe * the total number of CPUs in the PG
1099fb2f18f8Sesaxe */
1100fb2f18f8Sesaxe ASSERT(GROUP_SIZE(&pg->cmt_cpus_actv) <=
1101fb2f18f8Sesaxe GROUP_SIZE(&((pg_t *)pg)->pg_cpus));
1102fb2f18f8Sesaxe
1103fb2f18f8Sesaxe /*
1104fb2f18f8Sesaxe * Update the PG bitset in the CPU's old partition
1105fb2f18f8Sesaxe */
1106fb2f18f8Sesaxe found = B_FALSE;
1107fb2f18f8Sesaxe PG_CPU_ITR_INIT(pg, cpu_itr);
1108fb2f18f8Sesaxe while ((cpp = pg_cpu_next(&cpu_itr)) != NULL) {
1109fb2f18f8Sesaxe if (cpp == cp)
1110fb2f18f8Sesaxe continue;
1111a6604450Sesaxe if (CPU_ACTIVE(cpp) &&
1112a6604450Sesaxe cpp->cpu_part->cp_id == cp->cpu_part->cp_id) {
1113fb2f18f8Sesaxe found = B_TRUE;
1114fb2f18f8Sesaxe break;
1115fb2f18f8Sesaxe }
1116fb2f18f8Sesaxe }
1117fb2f18f8Sesaxe if (!found) {
1118fb2f18f8Sesaxe bitset_del(&cp->cpu_part->cp_cmt_pgs,
1119fb2f18f8Sesaxe ((pg_t *)pg)->pg_id);
1120fb2f18f8Sesaxe }
1121fb2f18f8Sesaxe }
1122fb2f18f8Sesaxe }
1123fb2f18f8Sesaxe
1124fb2f18f8Sesaxe /*
1125fb2f18f8Sesaxe * Return non-zero if the CPU belongs in the given PG
1126fb2f18f8Sesaxe */
1127fb2f18f8Sesaxe static int
pg_cmt_cpu_belongs(pg_t * pg,cpu_t * cp)1128fb2f18f8Sesaxe pg_cmt_cpu_belongs(pg_t *pg, cpu_t *cp)
1129fb2f18f8Sesaxe {
1130fb2f18f8Sesaxe cpu_t *pg_cpu;
1131fb2f18f8Sesaxe
1132fb2f18f8Sesaxe pg_cpu = GROUP_ACCESS(&pg->pg_cpus, 0);
1133fb2f18f8Sesaxe
1134fb2f18f8Sesaxe ASSERT(pg_cpu != NULL);
1135fb2f18f8Sesaxe
1136fb2f18f8Sesaxe /*
1137fb2f18f8Sesaxe * The CPU belongs if, given the nature of the hardware sharing
1138fb2f18f8Sesaxe * relationship represented by the PG, the CPU has that
1139fb2f18f8Sesaxe * relationship with some other CPU already in the PG
1140fb2f18f8Sesaxe */
1141fb2f18f8Sesaxe if (pg_plat_cpus_share(cp, pg_cpu, ((pghw_t *)pg)->pghw_hw))
1142fb2f18f8Sesaxe return (1);
1143fb2f18f8Sesaxe
1144fb2f18f8Sesaxe return (0);
1145fb2f18f8Sesaxe }
1146fb2f18f8Sesaxe
1147fb2f18f8Sesaxe /*
11480e751525SEric Saxe * Sort the CPUs CMT hierarchy, where "size" is the number of levels.
1149fb2f18f8Sesaxe */
1150fb2f18f8Sesaxe static void
pg_cmt_hier_sort(pg_cmt_t ** hier,int size)11510e751525SEric Saxe pg_cmt_hier_sort(pg_cmt_t **hier, int size)
1152fb2f18f8Sesaxe {
11538031591dSSrihari Venkatesan int i, j, inc, sz;
11548031591dSSrihari Venkatesan int start, end;
11550e751525SEric Saxe pg_t *tmp;
11560e751525SEric Saxe pg_t **h = (pg_t **)hier;
1157fb2f18f8Sesaxe
11580e751525SEric Saxe /*
11590e751525SEric Saxe * First sort by number of CPUs
11600e751525SEric Saxe */
11610e751525SEric Saxe inc = size / 2;
11620e751525SEric Saxe while (inc > 0) {
11630e751525SEric Saxe for (i = inc; i < size; i++) {
11640e751525SEric Saxe j = i;
11650e751525SEric Saxe tmp = h[i];
11660e751525SEric Saxe while ((j >= inc) &&
11670e751525SEric Saxe (PG_NUM_CPUS(h[j - inc]) > PG_NUM_CPUS(tmp))) {
11680e751525SEric Saxe h[j] = h[j - inc];
11690e751525SEric Saxe j = j - inc;
11700e751525SEric Saxe }
11710e751525SEric Saxe h[j] = tmp;
11720e751525SEric Saxe }
11730e751525SEric Saxe if (inc == 2)
11740e751525SEric Saxe inc = 1;
11750e751525SEric Saxe else
11760e751525SEric Saxe inc = (inc * 5) / 11;
11770e751525SEric Saxe }
1178fb2f18f8Sesaxe
11790e751525SEric Saxe /*
11800e751525SEric Saxe * Break ties by asking the platform.
11810e751525SEric Saxe * Determine if h[i] outranks h[i + 1] and if so, swap them.
11820e751525SEric Saxe */
11838031591dSSrihari Venkatesan for (start = 0; start < size; start++) {
11848031591dSSrihari Venkatesan
11858031591dSSrihari Venkatesan /*
11868031591dSSrihari Venkatesan * Find various contiguous sets of elements,
11878031591dSSrihari Venkatesan * in the array, with the same number of cpus
11888031591dSSrihari Venkatesan */
11898031591dSSrihari Venkatesan end = start;
11908031591dSSrihari Venkatesan sz = PG_NUM_CPUS(h[start]);
11918031591dSSrihari Venkatesan while ((end < size) && (sz == PG_NUM_CPUS(h[end])))
11928031591dSSrihari Venkatesan end++;
11938031591dSSrihari Venkatesan /*
11948031591dSSrihari Venkatesan * Sort each such set of the array by rank
11958031591dSSrihari Venkatesan */
11968031591dSSrihari Venkatesan for (i = start + 1; i < end; i++) {
11978031591dSSrihari Venkatesan j = i - 1;
11980e751525SEric Saxe tmp = h[i];
11998031591dSSrihari Venkatesan while (j >= start &&
12008031591dSSrihari Venkatesan pg_cmt_hier_rank(hier[j],
12018031591dSSrihari Venkatesan (pg_cmt_t *)tmp) == hier[j]) {
12028031591dSSrihari Venkatesan h[j + 1] = h[j];
12038031591dSSrihari Venkatesan j--;
12048031591dSSrihari Venkatesan }
12058031591dSSrihari Venkatesan h[j + 1] = tmp;
1206fb2f18f8Sesaxe }
1207fb2f18f8Sesaxe }
1208fb2f18f8Sesaxe }
1209fb2f18f8Sesaxe
1210fb2f18f8Sesaxe /*
1211fb2f18f8Sesaxe * Return a cmt_lgrp_t * given an lgroup handle.
1212fb2f18f8Sesaxe */
1213fb2f18f8Sesaxe static cmt_lgrp_t *
pg_cmt_find_lgrp(lgrp_handle_t hand)1214fb2f18f8Sesaxe pg_cmt_find_lgrp(lgrp_handle_t hand)
1215fb2f18f8Sesaxe {
1216fb2f18f8Sesaxe cmt_lgrp_t *lgrp;
1217fb2f18f8Sesaxe
1218fb2f18f8Sesaxe ASSERT(MUTEX_HELD(&cpu_lock));
1219fb2f18f8Sesaxe
1220fb2f18f8Sesaxe lgrp = cmt_lgrps;
1221fb2f18f8Sesaxe while (lgrp != NULL) {
1222fb2f18f8Sesaxe if (lgrp->cl_hand == hand)
1223a6604450Sesaxe break;
1224fb2f18f8Sesaxe lgrp = lgrp->cl_next;
1225fb2f18f8Sesaxe }
1226a6604450Sesaxe return (lgrp);
1227a6604450Sesaxe }
1228fb2f18f8Sesaxe
1229fb2f18f8Sesaxe /*
1230a6604450Sesaxe * Create a cmt_lgrp_t with the specified handle.
1231fb2f18f8Sesaxe */
1232a6604450Sesaxe static cmt_lgrp_t *
pg_cmt_lgrp_create(lgrp_handle_t hand)1233a6604450Sesaxe pg_cmt_lgrp_create(lgrp_handle_t hand)
1234a6604450Sesaxe {
1235a6604450Sesaxe cmt_lgrp_t *lgrp;
1236a6604450Sesaxe
1237a6604450Sesaxe ASSERT(MUTEX_HELD(&cpu_lock));
1238a6604450Sesaxe
1239fb2f18f8Sesaxe lgrp = kmem_zalloc(sizeof (cmt_lgrp_t), KM_SLEEP);
1240fb2f18f8Sesaxe
1241fb2f18f8Sesaxe lgrp->cl_hand = hand;
1242fb2f18f8Sesaxe lgrp->cl_npgs = 0;
1243fb2f18f8Sesaxe lgrp->cl_next = cmt_lgrps;
1244fb2f18f8Sesaxe cmt_lgrps = lgrp;
1245fb2f18f8Sesaxe group_create(&lgrp->cl_pgs);
1246fb2f18f8Sesaxe
1247fb2f18f8Sesaxe return (lgrp);
1248fb2f18f8Sesaxe }
12496890d023SEric Saxe
12506890d023SEric Saxe /*
12510e751525SEric Saxe * Interfaces to enable and disable power aware dispatching
12520e751525SEric Saxe * The caller must be holding cpu_lock.
12536890d023SEric Saxe *
12540e751525SEric Saxe * Return 0 on success and -1 on failure.
12556890d023SEric Saxe */
12560e751525SEric Saxe int
cmt_pad_enable(pghw_type_t type)12570e751525SEric Saxe cmt_pad_enable(pghw_type_t type)
12586890d023SEric Saxe {
12590e751525SEric Saxe group_t *hwset;
12600e751525SEric Saxe group_iter_t iter;
12610e751525SEric Saxe pg_cmt_t *pg;
12626890d023SEric Saxe
12630e751525SEric Saxe ASSERT(PGHW_IS_PM_DOMAIN(type));
12640e751525SEric Saxe ASSERT(MUTEX_HELD(&cpu_lock));
12656890d023SEric Saxe
1266040ea03aSRichard Lowe if (cmt_sched_disabled == 1)
1267040ea03aSRichard Lowe return (-1);
1268040ea03aSRichard Lowe
12690e751525SEric Saxe if ((hwset = pghw_set_lookup(type)) == NULL ||
12700e751525SEric Saxe cmt_hw_blacklisted[type]) {
12710e751525SEric Saxe /*
12720e751525SEric Saxe * Unable to find any instances of the specified type
12730e751525SEric Saxe * of power domain, or the power domains have been blacklisted.
12740e751525SEric Saxe */
12750e751525SEric Saxe return (-1);
12760e751525SEric Saxe }
12776890d023SEric Saxe
12786890d023SEric Saxe /*
12790e751525SEric Saxe * Iterate over the power domains, setting the default dispatcher
12800e751525SEric Saxe * policy for power/performance optimization.
12810e751525SEric Saxe *
12820e751525SEric Saxe * Simply setting the policy isn't enough in the case where the power
12830e751525SEric Saxe * domain is an only child of another PG. Because the dispatcher walks
12840e751525SEric Saxe * the PG hierarchy in a top down fashion, the higher up PG's policy
12850e751525SEric Saxe * will dominate. So promote the power domain above it's parent if both
12860e751525SEric Saxe * PG and it's parent have the same CPUs to ensure it's policy
12870e751525SEric Saxe * dominates.
12886890d023SEric Saxe */
12890e751525SEric Saxe group_iter_init(&iter);
12900e751525SEric Saxe while ((pg = group_iterate(hwset, &iter)) != NULL) {
12910e751525SEric Saxe /*
12920e751525SEric Saxe * If the power domain is an only child to a parent
12930e751525SEric Saxe * not implementing the same policy, promote the child
12940e751525SEric Saxe * above the parent to activate the policy.
12950e751525SEric Saxe */
12960e751525SEric Saxe pg->cmt_policy = pg_cmt_policy(((pghw_t *)pg)->pghw_hw);
12970e751525SEric Saxe while ((pg->cmt_parent != NULL) &&
12980e751525SEric Saxe (pg->cmt_parent->cmt_policy != pg->cmt_policy) &&
12990e751525SEric Saxe (PG_NUM_CPUS((pg_t *)pg) ==
13000e751525SEric Saxe PG_NUM_CPUS((pg_t *)pg->cmt_parent))) {
13011a77c24bSEric Saxe cmt_hier_promote(pg, NULL);
13020e751525SEric Saxe }
13030e751525SEric Saxe }
13040e751525SEric Saxe
13050e751525SEric Saxe return (0);
13060e751525SEric Saxe }
13070e751525SEric Saxe
13080e751525SEric Saxe int
cmt_pad_disable(pghw_type_t type)13090e751525SEric Saxe cmt_pad_disable(pghw_type_t type)
13100e751525SEric Saxe {
13110e751525SEric Saxe group_t *hwset;
13120e751525SEric Saxe group_iter_t iter;
13130e751525SEric Saxe pg_cmt_t *pg;
13140e751525SEric Saxe pg_cmt_t *child;
13150e751525SEric Saxe
13160e751525SEric Saxe ASSERT(PGHW_IS_PM_DOMAIN(type));
13170e751525SEric Saxe ASSERT(MUTEX_HELD(&cpu_lock));
13180e751525SEric Saxe
1319040ea03aSRichard Lowe if (cmt_sched_disabled == 1)
1320040ea03aSRichard Lowe return (-1);
1321040ea03aSRichard Lowe
13220e751525SEric Saxe if ((hwset = pghw_set_lookup(type)) == NULL) {
13230e751525SEric Saxe /*
13240e751525SEric Saxe * Unable to find any instances of the specified type of
13250e751525SEric Saxe * power domain.
13260e751525SEric Saxe */
13270e751525SEric Saxe return (-1);
13280e751525SEric Saxe }
13290e751525SEric Saxe /*
13300e751525SEric Saxe * Iterate over the power domains, setting the default dispatcher
13310e751525SEric Saxe * policy for performance optimization (load balancing).
13320e751525SEric Saxe */
13330e751525SEric Saxe group_iter_init(&iter);
13340e751525SEric Saxe while ((pg = group_iterate(hwset, &iter)) != NULL) {
13350e751525SEric Saxe
13360e751525SEric Saxe /*
13370e751525SEric Saxe * If the power domain has an only child that implements
13380e751525SEric Saxe * policy other than load balancing, promote the child
13390e751525SEric Saxe * above the power domain to ensure it's policy dominates.
13400e751525SEric Saxe */
1341f03808b6SEric Saxe if (pg->cmt_children != NULL &&
1342f03808b6SEric Saxe GROUP_SIZE(pg->cmt_children) == 1) {
13430e751525SEric Saxe child = GROUP_ACCESS(pg->cmt_children, 0);
13440e751525SEric Saxe if ((child->cmt_policy & CMT_BALANCE) == 0) {
13451a77c24bSEric Saxe cmt_hier_promote(child, NULL);
13460e751525SEric Saxe }
13470e751525SEric Saxe }
13480e751525SEric Saxe pg->cmt_policy = CMT_BALANCE;
13490e751525SEric Saxe }
13500e751525SEric Saxe return (0);
13510e751525SEric Saxe }
13520e751525SEric Saxe
13530e751525SEric Saxe /* ARGSUSED */
13540e751525SEric Saxe static void
cmt_ev_thread_swtch(pg_t * pg,cpu_t * cp,hrtime_t now,kthread_t * old,kthread_t * new)13550e751525SEric Saxe cmt_ev_thread_swtch(pg_t *pg, cpu_t *cp, hrtime_t now, kthread_t *old,
13560e751525SEric Saxe kthread_t *new)
13570e751525SEric Saxe {
13580e751525SEric Saxe pg_cmt_t *cmt_pg = (pg_cmt_t *)pg;
13590e751525SEric Saxe
13600e751525SEric Saxe if (old == cp->cpu_idle_thread) {
13611a5e258fSJosef 'Jeff' Sipek atomic_inc_32(&cmt_pg->cmt_utilization);
13620e751525SEric Saxe } else if (new == cp->cpu_idle_thread) {
13631a5e258fSJosef 'Jeff' Sipek atomic_dec_32(&cmt_pg->cmt_utilization);
13640e751525SEric Saxe }
13650e751525SEric Saxe }
13660e751525SEric Saxe
/*
 * Evaluates to non-zero when thread "thr" is in state TS_RUN, its dispatch
 * queue has a CPU associated with it, and that CPU is in the active CPU
 * bitset of the CMT PG "cmtpg".
 */
#define	THREAD_RUNNABLE_IN_PG(thr, cmtpg)				\
	((thr)->t_state == TS_RUN &&					\
	    (thr)->t_disp_queue->disp_cpu &&				\
	    bitset_in_set(&(cmtpg)->cmt_cpus_actv_set,			\
	    (thr)->t_disp_queue->disp_cpu->cpu_seqid))

13760e751525SEric Saxe static void
cmt_ev_thread_swtch_pwr(pg_t * pg,cpu_t * cp,hrtime_t now,kthread_t * old,kthread_t * new)13770e751525SEric Saxe cmt_ev_thread_swtch_pwr(pg_t *pg, cpu_t *cp, hrtime_t now, kthread_t *old,
13780e751525SEric Saxe kthread_t *new)
13790e751525SEric Saxe {
13800e751525SEric Saxe pg_cmt_t *cmt = (pg_cmt_t *)pg;
13810e751525SEric Saxe cpupm_domain_t *dom;
13820e751525SEric Saxe uint32_t u;
13830e751525SEric Saxe
13840e751525SEric Saxe if (old == cp->cpu_idle_thread) {
13850e751525SEric Saxe ASSERT(new != cp->cpu_idle_thread);
13861a5e258fSJosef 'Jeff' Sipek u = atomic_inc_32_nv(&cmt->cmt_utilization);
13870e751525SEric Saxe if (u == 1) {
13880e751525SEric Saxe /*
13890e751525SEric Saxe * Notify the CPU power manager that the domain
13900e751525SEric Saxe * is non-idle.
13910e751525SEric Saxe */
13920e751525SEric Saxe dom = (cpupm_domain_t *)cmt->cmt_pg.pghw_handle;
13930e751525SEric Saxe cpupm_utilization_event(cp, now, dom,
13940e751525SEric Saxe CPUPM_DOM_BUSY_FROM_IDLE);
13950e751525SEric Saxe }
13960e751525SEric Saxe } else if (new == cp->cpu_idle_thread) {
13970e751525SEric Saxe ASSERT(old != cp->cpu_idle_thread);
13981a5e258fSJosef 'Jeff' Sipek u = atomic_dec_32_nv(&cmt->cmt_utilization);
13990e751525SEric Saxe if (u == 0) {
14000e751525SEric Saxe /*
14010e751525SEric Saxe * The domain is idle, notify the CPU power
14020e751525SEric Saxe * manager.
14030e751525SEric Saxe *
14040e751525SEric Saxe * Avoid notifying if the thread is simply migrating
14050e751525SEric Saxe * between CPUs in the domain.
14060e751525SEric Saxe */
14070e751525SEric Saxe if (!THREAD_RUNNABLE_IN_PG(old, cmt)) {
14080e751525SEric Saxe dom = (cpupm_domain_t *)cmt->cmt_pg.pghw_handle;
14090e751525SEric Saxe cpupm_utilization_event(cp, now, dom,
14100e751525SEric Saxe CPUPM_DOM_IDLE_FROM_BUSY);
14110e751525SEric Saxe }
14120e751525SEric Saxe }
14130e751525SEric Saxe }
14140e751525SEric Saxe }
14150e751525SEric Saxe
14160e751525SEric Saxe /* ARGSUSED */
14170e751525SEric Saxe static void
cmt_ev_thread_remain_pwr(pg_t * pg,cpu_t * cp,kthread_t * t)14180e751525SEric Saxe cmt_ev_thread_remain_pwr(pg_t *pg, cpu_t *cp, kthread_t *t)
14190e751525SEric Saxe {
14200e751525SEric Saxe pg_cmt_t *cmt = (pg_cmt_t *)pg;
14210e751525SEric Saxe cpupm_domain_t *dom;
14220e751525SEric Saxe
14230e751525SEric Saxe dom = (cpupm_domain_t *)cmt->cmt_pg.pghw_handle;
14240e751525SEric Saxe cpupm_utilization_event(cp, (hrtime_t)0, dom, CPUPM_DOM_REMAIN_BUSY);
14250e751525SEric Saxe }
14260e751525SEric Saxe
14270e751525SEric Saxe /*
14280e751525SEric Saxe * Return the name of the CMT scheduling policy
14290e751525SEric Saxe * being implemented across this PG
14300e751525SEric Saxe */
14310e751525SEric Saxe static char *
pg_cmt_policy_name(pg_t * pg)14320e751525SEric Saxe pg_cmt_policy_name(pg_t *pg)
14330e751525SEric Saxe {
14340e751525SEric Saxe pg_cmt_policy_t policy;
14350e751525SEric Saxe
14360e751525SEric Saxe policy = ((pg_cmt_t *)pg)->cmt_policy;
14370e751525SEric Saxe
14380e751525SEric Saxe if (policy & CMT_AFFINITY) {
14390e751525SEric Saxe if (policy & CMT_BALANCE)
14400e751525SEric Saxe return ("Load Balancing & Affinity");
14410e751525SEric Saxe else if (policy & CMT_COALESCE)
14420e751525SEric Saxe return ("Load Coalescence & Affinity");
14436890d023SEric Saxe else
14440e751525SEric Saxe return ("Affinity");
14450e751525SEric Saxe } else {
14460e751525SEric Saxe if (policy & CMT_BALANCE)
14470e751525SEric Saxe return ("Load Balancing");
14480e751525SEric Saxe else if (policy & CMT_COALESCE)
14490e751525SEric Saxe return ("Load Coalescence");
14500e751525SEric Saxe else
14510e751525SEric Saxe return ("None");
14520e751525SEric Saxe }
14530e751525SEric Saxe }
14546890d023SEric Saxe
14556890d023SEric Saxe /*
14560e751525SEric Saxe * Prune PG, and all other instances of PG's hardware sharing relationship
1457d0e93b69SEric Saxe * from the CMT PG hierarchy.
14581a77c24bSEric Saxe *
14591a77c24bSEric Saxe * This routine operates on the CPU specific processor group data (for the CPUs
14601a77c24bSEric Saxe * in the PG being pruned), and may be invoked from a context where one CPU's
14611a77c24bSEric Saxe * PG data is under construction. In this case the argument "pgdata", if not
14621a77c24bSEric Saxe * NULL, is a reference to the CPU's under-construction PG data.
14636890d023SEric Saxe */
14640e751525SEric Saxe static int
pg_cmt_prune(pg_cmt_t * pg_bad,pg_cmt_t ** lineage,int * sz,cpu_pg_t * pgdata)14651a77c24bSEric Saxe pg_cmt_prune(pg_cmt_t *pg_bad, pg_cmt_t **lineage, int *sz, cpu_pg_t *pgdata)
14660e751525SEric Saxe {
14670e751525SEric Saxe group_t *hwset, *children;
14680e751525SEric Saxe int i, j, r, size = *sz;
14690e751525SEric Saxe group_iter_t hw_iter, child_iter;
14700e751525SEric Saxe pg_cpu_itr_t cpu_iter;
14710e751525SEric Saxe pg_cmt_t *pg, *child;
14720e751525SEric Saxe cpu_t *cpu;
14730e751525SEric Saxe int cap_needed;
14740e751525SEric Saxe pghw_type_t hw;
14756890d023SEric Saxe
14760e751525SEric Saxe ASSERT(MUTEX_HELD(&cpu_lock));
14776890d023SEric Saxe
1478d3c97224SAlexander Kolbasov /*
1479d3c97224SAlexander Kolbasov * Inform pghw layer that this PG is pruned.
1480d3c97224SAlexander Kolbasov */
1481d3c97224SAlexander Kolbasov pghw_cmt_fini((pghw_t *)pg_bad);
1482d3c97224SAlexander Kolbasov
14830e751525SEric Saxe hw = ((pghw_t *)pg_bad)->pghw_hw;
14840e751525SEric Saxe
14850e751525SEric Saxe if (hw == PGHW_POW_ACTIVE) {
14860e751525SEric Saxe cmn_err(CE_NOTE, "!Active CPUPM domain groups look suspect. "
14870e751525SEric Saxe "Event Based CPUPM Unavailable");
14880e751525SEric Saxe } else if (hw == PGHW_POW_IDLE) {
14890e751525SEric Saxe cmn_err(CE_NOTE, "!Idle CPUPM domain groups look suspect. "
14900e751525SEric Saxe "Dispatcher assisted CPUPM disabled.");
14910e751525SEric Saxe }
14926890d023SEric Saxe
14936890d023SEric Saxe /*
14940e751525SEric Saxe * Find and eliminate the PG from the lineage.
14956890d023SEric Saxe */
14960e751525SEric Saxe for (i = 0; i < size; i++) {
14970e751525SEric Saxe if (lineage[i] == pg_bad) {
14980e751525SEric Saxe for (j = i; j < size - 1; j++)
14990e751525SEric Saxe lineage[j] = lineage[j + 1];
15000e751525SEric Saxe *sz = size - 1;
15010e751525SEric Saxe break;
15020e751525SEric Saxe }
15030e751525SEric Saxe }
15040e751525SEric Saxe
15050e751525SEric Saxe /*
15060e751525SEric Saxe * We'll prune all instances of the hardware sharing relationship
15070e751525SEric Saxe * represented by pg. But before we do that (and pause CPUs) we need
15080e751525SEric Saxe * to ensure the hierarchy's groups are properly sized.
15090e751525SEric Saxe */
15100e751525SEric Saxe hwset = pghw_set_lookup(hw);
15110e751525SEric Saxe
15120e751525SEric Saxe /*
1513d0e93b69SEric Saxe * Blacklist the hardware so future processor groups of this type won't
1514d0e93b69SEric Saxe * participate in CMT thread placement.
1515d0e93b69SEric Saxe *
1516d0e93b69SEric Saxe * XXX
1517d0e93b69SEric Saxe * For heterogeneous system configurations, this might be overkill.
1518d0e93b69SEric Saxe * We may only need to blacklist the illegal PGs, and other instances
1519d0e93b69SEric Saxe * of this hardware sharing relationship may be ok.
15200e751525SEric Saxe */
15210e751525SEric Saxe cmt_hw_blacklisted[hw] = 1;
15220e751525SEric Saxe
15230e751525SEric Saxe /*
15240e751525SEric Saxe * For each of the PGs being pruned, ensure sufficient capacity in
15250e751525SEric Saxe * the siblings set for the PG's children
15260e751525SEric Saxe */
15270e751525SEric Saxe group_iter_init(&hw_iter);
15280e751525SEric Saxe while ((pg = group_iterate(hwset, &hw_iter)) != NULL) {
15290e751525SEric Saxe /*
15300e751525SEric Saxe * PG is being pruned, but if it is bringing up more than
15310e751525SEric Saxe * one child, ask for more capacity in the siblings group.
15320e751525SEric Saxe */
15330e751525SEric Saxe cap_needed = 0;
15340e751525SEric Saxe if (pg->cmt_children &&
15350e751525SEric Saxe GROUP_SIZE(pg->cmt_children) > 1) {
15360e751525SEric Saxe cap_needed = GROUP_SIZE(pg->cmt_children) - 1;
15370e751525SEric Saxe
15380e751525SEric Saxe group_expand(pg->cmt_siblings,
15390e751525SEric Saxe GROUP_SIZE(pg->cmt_siblings) + cap_needed);
15400e751525SEric Saxe
15410e751525SEric Saxe /*
15420e751525SEric Saxe * If this is a top level group, also ensure the
15430e751525SEric Saxe * capacity in the root lgrp level CMT grouping.
15440e751525SEric Saxe */
15450e751525SEric Saxe if (pg->cmt_parent == NULL &&
15460e751525SEric Saxe pg->cmt_siblings != &cmt_root->cl_pgs) {
15470e751525SEric Saxe group_expand(&cmt_root->cl_pgs,
15480e751525SEric Saxe GROUP_SIZE(&cmt_root->cl_pgs) + cap_needed);
1549d0e93b69SEric Saxe cmt_root->cl_npgs += cap_needed;
15500e751525SEric Saxe }
15510e751525SEric Saxe }
15520e751525SEric Saxe }
15530e751525SEric Saxe
15540e751525SEric Saxe /*
15550e751525SEric Saxe * We're operating on the PG hierarchy. Pause CPUs to ensure
15560e751525SEric Saxe * exclusivity with respect to the dispatcher.
15570e751525SEric Saxe */
1558*0ed5c46eSJosef 'Jeff' Sipek pause_cpus(NULL, NULL);
15590e751525SEric Saxe
15600e751525SEric Saxe /*
15610e751525SEric Saxe * Prune all PG instances of the hardware sharing relationship
15620e751525SEric Saxe * represented by pg.
15630e751525SEric Saxe */
15640e751525SEric Saxe group_iter_init(&hw_iter);
15650e751525SEric Saxe while ((pg = group_iterate(hwset, &hw_iter)) != NULL) {
15660e751525SEric Saxe
15670e751525SEric Saxe /*
15680e751525SEric Saxe * Remove PG from it's group of siblings, if it's there.
15690e751525SEric Saxe */
15700e751525SEric Saxe if (pg->cmt_siblings) {
15710e751525SEric Saxe (void) group_remove(pg->cmt_siblings, pg, GRP_NORESIZE);
15720e751525SEric Saxe }
15730e751525SEric Saxe if (pg->cmt_parent == NULL &&
15740e751525SEric Saxe pg->cmt_siblings != &cmt_root->cl_pgs) {
15750e751525SEric Saxe (void) group_remove(&cmt_root->cl_pgs, pg,
15760e751525SEric Saxe GRP_NORESIZE);
15770e751525SEric Saxe }
1578d0e93b69SEric Saxe
1579d0e93b69SEric Saxe /*
1580d0e93b69SEric Saxe * Indicate that no CMT policy will be implemented across
1581d0e93b69SEric Saxe * this PG.
1582d0e93b69SEric Saxe */
1583d0e93b69SEric Saxe pg->cmt_policy = CMT_NO_POLICY;
1584d0e93b69SEric Saxe
15850e751525SEric Saxe /*
1586ef4f35d8SEric Saxe * Move PG's children from it's children set to it's parent's
1587ef4f35d8SEric Saxe * children set. Note that the parent's children set, and PG's
1588ef4f35d8SEric Saxe * siblings set are the same thing.
1589ef4f35d8SEric Saxe *
1590ef4f35d8SEric Saxe * Because we are iterating over the same group that we are
1591ef4f35d8SEric Saxe * operating on (removing the children), first add all of PG's
1592ef4f35d8SEric Saxe * children to the parent's children set, and once we are done
1593ef4f35d8SEric Saxe * iterating, empty PG's children set.
15940e751525SEric Saxe */
15950e751525SEric Saxe if (pg->cmt_children != NULL) {
15960e751525SEric Saxe children = pg->cmt_children;
15970e751525SEric Saxe
15980e751525SEric Saxe group_iter_init(&child_iter);
15990e751525SEric Saxe while ((child = group_iterate(children, &child_iter))
16000e751525SEric Saxe != NULL) {
1601ef4f35d8SEric Saxe if (pg->cmt_siblings != NULL) {
16020e751525SEric Saxe r = group_add(pg->cmt_siblings, child,
16030e751525SEric Saxe GRP_NORESIZE);
16040e751525SEric Saxe ASSERT(r == 0);
1605d0e93b69SEric Saxe
1606d0e93b69SEric Saxe if (pg->cmt_parent == NULL &&
1607d0e93b69SEric Saxe pg->cmt_siblings !=
1608d0e93b69SEric Saxe &cmt_root->cl_pgs) {
1609d0e93b69SEric Saxe r = group_add(&cmt_root->cl_pgs,
1610d0e93b69SEric Saxe child, GRP_NORESIZE);
1611d0e93b69SEric Saxe ASSERT(r == 0);
1612d0e93b69SEric Saxe }
16130e751525SEric Saxe }
16140e751525SEric Saxe }
1615ef4f35d8SEric Saxe group_empty(pg->cmt_children);
16160e751525SEric Saxe }
16170e751525SEric Saxe
16180e751525SEric Saxe /*
16190e751525SEric Saxe * Reset the callbacks to the defaults
16200e751525SEric Saxe */
16210e751525SEric Saxe pg_callback_set_defaults((pg_t *)pg);
16220e751525SEric Saxe
16230e751525SEric Saxe /*
16240e751525SEric Saxe * Update all the CPU lineages in each of PG's CPUs
16250e751525SEric Saxe */
16260e751525SEric Saxe PG_CPU_ITR_INIT(pg, cpu_iter);
16270e751525SEric Saxe while ((cpu = pg_cpu_next(&cpu_iter)) != NULL) {
16280e751525SEric Saxe pg_cmt_t *cpu_pg;
16290e751525SEric Saxe group_iter_t liter; /* Iterator for the lineage */
16301a77c24bSEric Saxe cpu_pg_t *cpd; /* CPU's PG data */
16311a77c24bSEric Saxe
16321a77c24bSEric Saxe /*
16331a77c24bSEric Saxe * The CPU's lineage is under construction still
16341a77c24bSEric Saxe * references the bootstrap CPU PG data structure.
16351a77c24bSEric Saxe */
16361a77c24bSEric Saxe if (pg_cpu_is_bootstrapped(cpu))
16371a77c24bSEric Saxe cpd = pgdata;
16381a77c24bSEric Saxe else
16391a77c24bSEric Saxe cpd = cpu->cpu_pg;
16400e751525SEric Saxe
16410e751525SEric Saxe /*
16420e751525SEric Saxe * Iterate over the CPU's PGs updating the children
16430e751525SEric Saxe * of the PG being promoted, since they have a new
16440e751525SEric Saxe * parent and siblings set.
16450e751525SEric Saxe */
16460e751525SEric Saxe group_iter_init(&liter);
16471a77c24bSEric Saxe while ((cpu_pg = group_iterate(&cpd->pgs,
16481a77c24bSEric Saxe &liter)) != NULL) {
16490e751525SEric Saxe if (cpu_pg->cmt_parent == pg) {
16500e751525SEric Saxe cpu_pg->cmt_parent = pg->cmt_parent;
16510e751525SEric Saxe cpu_pg->cmt_siblings = pg->cmt_siblings;
16520e751525SEric Saxe }
16530e751525SEric Saxe }
16540e751525SEric Saxe
16550e751525SEric Saxe /*
16560e751525SEric Saxe * Update the CPU's lineages
1657d0e93b69SEric Saxe *
1658d0e93b69SEric Saxe * Remove the PG from the CPU's group used for CMT
1659d0e93b69SEric Saxe * scheduling.
16600e751525SEric Saxe */
16611a77c24bSEric Saxe (void) group_remove(&cpd->cmt_pgs, pg, GRP_NORESIZE);
16620e751525SEric Saxe }
16630e751525SEric Saxe }
16640e751525SEric Saxe start_cpus();
16650e751525SEric Saxe return (0);
16660e751525SEric Saxe }
16670e751525SEric Saxe
16680e751525SEric Saxe /*
16690e751525SEric Saxe * Disable CMT scheduling
16700e751525SEric Saxe */
16710e751525SEric Saxe static void
pg_cmt_disable(void)16720e751525SEric Saxe pg_cmt_disable(void)
16730e751525SEric Saxe {
16740e751525SEric Saxe cpu_t *cpu;
16750e751525SEric Saxe
16761a77c24bSEric Saxe ASSERT(MUTEX_HELD(&cpu_lock));
16771a77c24bSEric Saxe
1678*0ed5c46eSJosef 'Jeff' Sipek pause_cpus(NULL, NULL);
16790e751525SEric Saxe cpu = cpu_list;
16800e751525SEric Saxe
16816890d023SEric Saxe do {
16820e751525SEric Saxe if (cpu->cpu_pg)
16830e751525SEric Saxe group_empty(&cpu->cpu_pg->cmt_pgs);
16840e751525SEric Saxe } while ((cpu = cpu->cpu_next) != cpu_list);
16850e751525SEric Saxe
16860e751525SEric Saxe cmt_sched_disabled = 1;
16870e751525SEric Saxe start_cpus();
16880e751525SEric Saxe cmn_err(CE_NOTE, "!CMT thread placement optimizations unavailable");
16890e751525SEric Saxe }
16900e751525SEric Saxe
1691ef4f35d8SEric Saxe /*
1692ef4f35d8SEric Saxe * CMT lineage validation
1693ef4f35d8SEric Saxe *
1694ef4f35d8SEric Saxe * This routine is invoked by pg_cmt_cpu_init() to validate the integrity
1695ef4f35d8SEric Saxe * of the PGs in a CPU's lineage. This is necessary because it's possible that
1696ef4f35d8SEric Saxe * some groupings (power domain groupings in particular) may be defined by
1697ef4f35d8SEric Saxe * sources that are buggy (e.g. BIOS bugs). In such cases, it may not be
1698ef4f35d8SEric Saxe * possible to integrate those groupings into the CMT PG hierarchy, if doing
1699ef4f35d8SEric Saxe * so would violate the subset invariant of the hierarchy, which says that
1700ef4f35d8SEric Saxe * a PG must be subset of its parent (if it has one).
1701ef4f35d8SEric Saxe *
1702ef4f35d8SEric Saxe * pg_cmt_lineage_validate()'s purpose is to detect grouping definitions that
1703ef4f35d8SEric Saxe * would result in a violation of this invariant. If a violation is found,
1704ef4f35d8SEric Saxe * and the PG is of a grouping type who's definition is known to originate from
1705ef4f35d8SEric Saxe * suspect sources (BIOS), then pg_cmt_prune() will be invoked to prune the
1706b025faeeSEric Saxe * PG (and all other instances PG's sharing relationship type) from the CMT
1707ef4f35d8SEric Saxe * hierarchy. Further, future instances of that sharing relationship type won't
1708b025faeeSEric Saxe * be added. If the grouping definition doesn't originate from suspect
1709ef4f35d8SEric Saxe * sources, then pg_cmt_disable() will be invoked to log an error, and disable
1710ef4f35d8SEric Saxe * CMT scheduling altogether.
1711ef4f35d8SEric Saxe *
1712ef4f35d8SEric Saxe * This routine is invoked after the CPU has been added to the PGs in which
1713ef4f35d8SEric Saxe * it belongs, but before those PGs have been added to (or had their place
1714ef4f35d8SEric Saxe * adjusted in) the CMT PG hierarchy.
1715ef4f35d8SEric Saxe *
1716ef4f35d8SEric Saxe * The first argument is the CPUs PG lineage (essentially an array of PGs in
1717ef4f35d8SEric Saxe * which the CPU belongs) that has already been sorted in ascending order
1718ef4f35d8SEric Saxe * by CPU count. Some of the PGs in the CPUs lineage may already have other
1719ef4f35d8SEric Saxe * CPUs in them, and have already been integrated into the CMT hierarchy.
1720ef4f35d8SEric Saxe *
1721ef4f35d8SEric Saxe * The addition of this new CPU to these pre-existing PGs means that those
1722ef4f35d8SEric Saxe * PGs may need to be promoted up in the hierarchy to satisfy the subset
1723ef4f35d8SEric Saxe * invariant. In additon to testing the subset invariant for the lineage,
1724ef4f35d8SEric Saxe * this routine also verifies that the addition of the new CPU to the
1725ef4f35d8SEric Saxe * existing PGs wouldn't cause the subset invariant to be violated in
1726ef4f35d8SEric Saxe * the exiting lineages.
1727ef4f35d8SEric Saxe *
1728ef4f35d8SEric Saxe * This routine will normally return one of the following:
1729ef4f35d8SEric Saxe * CMT_LINEAGE_VALID - There were no problems detected with the lineage.
1730ef4f35d8SEric Saxe * CMT_LINEAGE_REPAIRED - Problems were detected, but repaired via pruning.
1731ef4f35d8SEric Saxe *
1732ef4f35d8SEric Saxe * Otherwise, this routine will return a value indicating which error it
1733ef4f35d8SEric Saxe * was unable to recover from (and set cmt_lineage_status along the way).
17341a77c24bSEric Saxe *
17351a77c24bSEric Saxe * This routine operates on the CPU specific processor group data (for the CPU
17361a77c24bSEric Saxe * whose lineage is being validated), which is under-construction.
17371a77c24bSEric Saxe * "pgdata" is a reference to the CPU's under-construction PG data.
17381a77c24bSEric Saxe * This routine must be careful to operate only on "pgdata", and not cp->cpu_pg.
1739ef4f35d8SEric Saxe */
1740ef4f35d8SEric Saxe static cmt_lineage_validation_t
pg_cmt_lineage_validate(pg_cmt_t ** lineage,int * sz,cpu_pg_t * pgdata)17411a77c24bSEric Saxe pg_cmt_lineage_validate(pg_cmt_t **lineage, int *sz, cpu_pg_t *pgdata)
17420e751525SEric Saxe {
1743ef4f35d8SEric Saxe int i, j, size;
1744b025faeeSEric Saxe pg_cmt_t *pg, *pg_next, *pg_bad, *pg_tmp, *parent;
17450e751525SEric Saxe cpu_t *cp;
17460e751525SEric Saxe pg_cpu_itr_t cpu_iter;
1747ef4f35d8SEric Saxe lgrp_handle_t lgrp;
17480e751525SEric Saxe
17490e751525SEric Saxe ASSERT(MUTEX_HELD(&cpu_lock));
17500e751525SEric Saxe
17510e751525SEric Saxe revalidate:
17520e751525SEric Saxe size = *sz;
17530e751525SEric Saxe pg_bad = NULL;
1754ef4f35d8SEric Saxe lgrp = LGRP_NULL_HANDLE;
1755ef4f35d8SEric Saxe for (i = 0; i < size; i++) {
17560e751525SEric Saxe
17570e751525SEric Saxe pg = lineage[i];
1758ef4f35d8SEric Saxe if (i < size - 1)
1759ef4f35d8SEric Saxe pg_next = lineage[i + 1];
1760ef4f35d8SEric Saxe else
1761ef4f35d8SEric Saxe pg_next = NULL;
17626890d023SEric Saxe
17636890d023SEric Saxe /*
17640e751525SEric Saxe * We assume that the lineage has already been sorted
17650e751525SEric Saxe * by the number of CPUs. In fact, we depend on it.
17666890d023SEric Saxe */
1767ef4f35d8SEric Saxe ASSERT(pg_next == NULL ||
1768ef4f35d8SEric Saxe (PG_NUM_CPUS((pg_t *)pg) <= PG_NUM_CPUS((pg_t *)pg_next)));
17696890d023SEric Saxe
17706890d023SEric Saxe /*
1771b025faeeSEric Saxe * The CPUs PG lineage was passed as the first argument to
1772b025faeeSEric Saxe * this routine and contains the sorted list of the CPU's
1773b025faeeSEric Saxe * PGs. Ultimately, the ordering of the PGs in that list, and
1774b025faeeSEric Saxe * the ordering as traversed by the cmt_parent list must be
1775b025faeeSEric Saxe * the same. PG promotion will be used as the mechanism to
1776b025faeeSEric Saxe * achieve this, but first we need to look for cases where
1777b025faeeSEric Saxe * promotion will be necessary, and validate that will be
1778b025faeeSEric Saxe * possible without violating the subset invarient described
1779b025faeeSEric Saxe * above.
1780ef4f35d8SEric Saxe *
1781ef4f35d8SEric Saxe * Since the PG topology is in the middle of being changed, we
1782ef4f35d8SEric Saxe * need to check whether the PG's existing parent (if any) is
1783b025faeeSEric Saxe * part of this CPU's lineage (and therefore should contain
1784b025faeeSEric Saxe * the new CPU). If not, it means that the addition of the
1785b025faeeSEric Saxe * new CPU should have made this PG have more CPUs than its
1786b025faeeSEric Saxe * parent (and other ancestors not in the same lineage) and
1787b025faeeSEric Saxe * will need to be promoted into place.
1788b025faeeSEric Saxe *
1789b025faeeSEric Saxe * We need to verify all of this to defend against a buggy
1790ef4f35d8SEric Saxe * BIOS giving bad power domain CPU groupings. Sigh.
1791ef4f35d8SEric Saxe */
1792b025faeeSEric Saxe parent = pg->cmt_parent;
1793b025faeeSEric Saxe while (parent != NULL) {
1794ef4f35d8SEric Saxe /*
1795b025faeeSEric Saxe * Determine if the parent/ancestor is in this lineage
1796ef4f35d8SEric Saxe */
1797b025faeeSEric Saxe pg_tmp = NULL;
1798b025faeeSEric Saxe for (j = 0; (j < size) && (pg_tmp != parent); j++) {
1799ef4f35d8SEric Saxe pg_tmp = lineage[j];
1800b025faeeSEric Saxe }
1801b025faeeSEric Saxe if (pg_tmp == parent) {
1802b025faeeSEric Saxe /*
1803b025faeeSEric Saxe * It's in the lineage. The concentricity
1804b025faeeSEric Saxe * checks will handle the rest.
1805b025faeeSEric Saxe */
1806ef4f35d8SEric Saxe break;
1807ef4f35d8SEric Saxe }
1808ef4f35d8SEric Saxe /*
1809b025faeeSEric Saxe * If it is not in the lineage, PG will eventually
1810b025faeeSEric Saxe * need to be promoted above it. Verify the ancestor
1811b025faeeSEric Saxe * is a proper subset. There is still an error if
1812b025faeeSEric Saxe * the ancestor has the same number of CPUs as PG,
1813b025faeeSEric Saxe * since that would imply it should be in the lineage,
1814b025faeeSEric Saxe * and we already know it isn't.
1815ef4f35d8SEric Saxe */
1816b025faeeSEric Saxe if (PG_NUM_CPUS((pg_t *)parent) >=
1817ef4f35d8SEric Saxe PG_NUM_CPUS((pg_t *)pg)) {
1818ef4f35d8SEric Saxe /*
1819b025faeeSEric Saxe * Not a proper subset if the parent/ancestor
1820b025faeeSEric Saxe * has the same or more CPUs than PG.
1821ef4f35d8SEric Saxe */
1822b025faeeSEric Saxe cmt_lineage_status = CMT_LINEAGE_NON_PROMOTABLE;
1823ef4f35d8SEric Saxe goto handle_error;
1824ef4f35d8SEric Saxe }
1825b025faeeSEric Saxe parent = parent->cmt_parent;
1826ef4f35d8SEric Saxe }
1827ef4f35d8SEric Saxe
1828ef4f35d8SEric Saxe /*
1829ef4f35d8SEric Saxe * Walk each of the CPUs in the PGs group and perform
1830ef4f35d8SEric Saxe * consistency checks along the way.
18316890d023SEric Saxe */
18320e751525SEric Saxe PG_CPU_ITR_INIT((pg_t *)pg, cpu_iter);
18330e751525SEric Saxe while ((cp = pg_cpu_next(&cpu_iter)) != NULL) {
1834ef4f35d8SEric Saxe /*
1835ef4f35d8SEric Saxe * Verify that there aren't any CPUs contained in PG
1836ef4f35d8SEric Saxe * that the next PG in the lineage (which is larger
1837ef4f35d8SEric Saxe * or same size) doesn't also contain.
1838ef4f35d8SEric Saxe */
1839ef4f35d8SEric Saxe if (pg_next != NULL &&
1840ef4f35d8SEric Saxe pg_cpu_find((pg_t *)pg_next, cp) == B_FALSE) {
18410e751525SEric Saxe cmt_lineage_status = CMT_LINEAGE_NON_CONCENTRIC;
18420e751525SEric Saxe goto handle_error;
18436890d023SEric Saxe }
1844ef4f35d8SEric Saxe
1845ef4f35d8SEric Saxe /*
1846ef4f35d8SEric Saxe * Verify that all the CPUs in the PG are in the same
1847ef4f35d8SEric Saxe * lgroup.
1848ef4f35d8SEric Saxe */
1849ef4f35d8SEric Saxe if (lgrp == LGRP_NULL_HANDLE) {
1850ef4f35d8SEric Saxe lgrp = lgrp_plat_cpu_to_hand(cp->cpu_id);
1851ef4f35d8SEric Saxe } else if (lgrp_plat_cpu_to_hand(cp->cpu_id) != lgrp) {
1852ef4f35d8SEric Saxe cmt_lineage_status = CMT_LINEAGE_PG_SPANS_LGRPS;
1853ef4f35d8SEric Saxe goto handle_error;
1854ef4f35d8SEric Saxe }
18550e751525SEric Saxe }
18566890d023SEric Saxe }
18576890d023SEric Saxe
18580e751525SEric Saxe handle_error:
1859ef4f35d8SEric Saxe /*
1860ef4f35d8SEric Saxe * Some of these validation errors can result when the CPU grouping
1861ef4f35d8SEric Saxe * information is derived from buggy sources (for example, incorrect
1862ef4f35d8SEric Saxe * ACPI tables on x86 systems).
1863ef4f35d8SEric Saxe *
1864ef4f35d8SEric Saxe * We'll try to recover in such cases by pruning out the illegal
1865ef4f35d8SEric Saxe * groupings from the PG hierarchy, which means that we won't optimize
1866ef4f35d8SEric Saxe * for those levels, but we will for the remaining ones.
1867ef4f35d8SEric Saxe */
18680e751525SEric Saxe switch (cmt_lineage_status) {
18690e751525SEric Saxe case CMT_LINEAGE_VALID:
18700e751525SEric Saxe case CMT_LINEAGE_REPAIRED:
18710e751525SEric Saxe break;
1872ef4f35d8SEric Saxe case CMT_LINEAGE_PG_SPANS_LGRPS:
1873ef4f35d8SEric Saxe /*
1874ef4f35d8SEric Saxe * We've detected a PG whose CPUs span lgroups.
1875ef4f35d8SEric Saxe *
1876ef4f35d8SEric Saxe * This isn't supported, as the dispatcher isn't allowed to
1877ef4f35d8SEric Saxe * to do CMT thread placement across lgroups, as this would
1878ef4f35d8SEric Saxe * conflict with policies implementing MPO thread affinity.
1879ef4f35d8SEric Saxe *
1880d0e93b69SEric Saxe * If the PG is of a sharing relationship type known to
1881d0e93b69SEric Saxe * legitimately span lgroups, specify that no CMT thread
1882d0e93b69SEric Saxe * placement policy should be implemented, and prune the PG
1883d0e93b69SEric Saxe * from the existing CMT PG hierarchy.
1884d0e93b69SEric Saxe *
1885d0e93b69SEric Saxe * Otherwise, fall though to the case below for handling.
1886ef4f35d8SEric Saxe */
1887d0e93b69SEric Saxe if (((pghw_t *)pg)->pghw_hw == PGHW_CHIP) {
1888d0e93b69SEric Saxe if (pg_cmt_prune(pg, lineage, sz, pgdata) == 0) {
1889d0e93b69SEric Saxe cmt_lineage_status = CMT_LINEAGE_REPAIRED;
1890d0e93b69SEric Saxe goto revalidate;
1891d0e93b69SEric Saxe }
1892d0e93b69SEric Saxe }
1893d0e93b69SEric Saxe /*LINTED*/
1894ef4f35d8SEric Saxe case CMT_LINEAGE_NON_PROMOTABLE:
1895ef4f35d8SEric Saxe /*
1896ef4f35d8SEric Saxe * We've detected a PG that already exists in another CPU's
1897ef4f35d8SEric Saxe * lineage that cannot cannot legally be promoted into place
1898ef4f35d8SEric Saxe * without breaking the invariants of the hierarchy.
1899ef4f35d8SEric Saxe */
1900ef4f35d8SEric Saxe if (PG_CMT_HW_SUSPECT(((pghw_t *)pg)->pghw_hw)) {
19011a77c24bSEric Saxe if (pg_cmt_prune(pg, lineage, sz, pgdata) == 0) {
1902ef4f35d8SEric Saxe cmt_lineage_status = CMT_LINEAGE_REPAIRED;
1903ef4f35d8SEric Saxe goto revalidate;
1904ef4f35d8SEric Saxe }
1905ef4f35d8SEric Saxe }
1906ef4f35d8SEric Saxe /*
1907ef4f35d8SEric Saxe * Something went wrong trying to prune out the bad level.
1908ef4f35d8SEric Saxe * Disable CMT scheduling altogether.
1909ef4f35d8SEric Saxe */
1910ef4f35d8SEric Saxe pg_cmt_disable();
1911ef4f35d8SEric Saxe break;
19120e751525SEric Saxe case CMT_LINEAGE_NON_CONCENTRIC:
19136890d023SEric Saxe /*
1914ef4f35d8SEric Saxe * We've detected a non-concentric PG lineage, which means that
1915ef4f35d8SEric Saxe * there's a PG in the lineage that has CPUs that the next PG
1916ef4f35d8SEric Saxe * over in the lineage (which is the same size or larger)
1917ef4f35d8SEric Saxe * doesn't have.
19180e751525SEric Saxe *
1919ef4f35d8SEric Saxe * In this case, we examine the two PGs to see if either
1920ef4f35d8SEric Saxe * grouping is defined by potentially buggy sources.
19210e751525SEric Saxe *
19220e751525SEric Saxe * If one has less CPUs than the other, and contains CPUs
19230e751525SEric Saxe * not found in the parent, and it is an untrusted enumeration,
19240e751525SEric Saxe * then prune it. If both have the same number of CPUs, then
19250e751525SEric Saxe * prune the one that is untrusted.
19260e751525SEric Saxe *
19270e751525SEric Saxe * This process repeats until we have a concentric lineage,
19280e751525SEric Saxe * or we would have to prune out level derived from what we
19290e751525SEric Saxe * thought was a reliable source, in which case CMT scheduling
1930ef4f35d8SEric Saxe * is disabled altogether.
19316890d023SEric Saxe */
1932ef4f35d8SEric Saxe if ((PG_NUM_CPUS((pg_t *)pg) < PG_NUM_CPUS((pg_t *)pg_next)) &&
19330e751525SEric Saxe (PG_CMT_HW_SUSPECT(((pghw_t *)pg)->pghw_hw))) {
19340e751525SEric Saxe pg_bad = pg;
19350e751525SEric Saxe } else if (PG_NUM_CPUS((pg_t *)pg) ==
1936ef4f35d8SEric Saxe PG_NUM_CPUS((pg_t *)pg_next)) {
1937ef4f35d8SEric Saxe if (PG_CMT_HW_SUSPECT(((pghw_t *)pg_next)->pghw_hw)) {
1938ef4f35d8SEric Saxe pg_bad = pg_next;
19390e751525SEric Saxe } else if (PG_CMT_HW_SUSPECT(((pghw_t *)pg)->pghw_hw)) {
19400e751525SEric Saxe pg_bad = pg;
19416890d023SEric Saxe }
19426890d023SEric Saxe }
19430e751525SEric Saxe if (pg_bad) {
19441a77c24bSEric Saxe if (pg_cmt_prune(pg_bad, lineage, sz, pgdata) == 0) {
19450e751525SEric Saxe cmt_lineage_status = CMT_LINEAGE_REPAIRED;
19460e751525SEric Saxe goto revalidate;
19470e751525SEric Saxe }
19480e751525SEric Saxe }
19490e751525SEric Saxe /*
1950ef4f35d8SEric Saxe * Something went wrong trying to identify and/or prune out
1951ef4f35d8SEric Saxe * the bad level. Disable CMT scheduling altogether.
19520e751525SEric Saxe */
19530e751525SEric Saxe pg_cmt_disable();
1954ef4f35d8SEric Saxe break;
1955ef4f35d8SEric Saxe default:
1956ef4f35d8SEric Saxe /*
1957ef4f35d8SEric Saxe * If we're here, we've encountered a validation error for
1958ef4f35d8SEric Saxe * which we don't know how to recover. In this case, disable
1959ef4f35d8SEric Saxe * CMT scheduling altogether.
1960ef4f35d8SEric Saxe */
19610e751525SEric Saxe cmt_lineage_status = CMT_LINEAGE_UNRECOVERABLE;
1962ef4f35d8SEric Saxe pg_cmt_disable();
19630e751525SEric Saxe }
1964ef4f35d8SEric Saxe return (cmt_lineage_status);
19656890d023SEric Saxe }
1966