/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/systm.h>
#include <sys/types.h>
#include <sys/param.h>
#include <sys/thread.h>
#include <sys/cpuvar.h>
#include <sys/cpupart.h>
#include <sys/kmem.h>
#include <sys/cmn_err.h>
#include <sys/kstat.h>
#include <sys/processor.h>
#include <sys/disp.h>
#include <sys/group.h>
#include <sys/pghw.h>
#include <sys/bitset.h>
#include <sys/lgrp.h>
#include <sys/cmt.h>
#include <sys/cpu_pm.h>

/*
 * CMT scheduler / dispatcher support
 *
 * This file implements CMT scheduler support using Processor Groups.
 * The CMT processor group class creates and maintains the CMT class
 * specific processor group pg_cmt_t.
 *
 * ----------------------------  <-- pg_cmt_t *
 * | pghw_t                   |
 * ----------------------------
 * | CMT class specific data  |
 * | - hierarchy linkage      |
 * | - CMT load balancing data|
 * | - active CPU group/bitset|
 * ----------------------------
 *
 * The scheduler/dispatcher leverages knowledge of the performance
 * relevant CMT sharing relationships existing between cpus to implement
 * optimized affinity, load balancing, and coalescence policies.
 *
 * Load balancing policy seeks to improve performance by minimizing
 * contention over shared processor resources / facilities. Affinity
 * policies seek to improve cache and TLB utilization. Coalescence
 * policies improve resource utilization and ultimately power efficiency.
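 *
 * As an illustrative (hypothetical) example of the resulting hierarchy,
 * consider a chip with two cores, each core having two hardware strands
 * that share the core's instruction pipeline. Ignoring other sharing
 * relationships, this class would build:
 *
 *	           chip PG (4 CPUs)
 *	          /                \
 *	pipeline PG (2 CPUs)    pipeline PG (2 CPUs)
 *
 * with each CPU's CMT load balancing lineage beginning at its (smallest)
 * pipeline PG.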
 *
 * The CMT PGs created by this class are already arranged into a
 * hierarchy (which is done in the pghw layer). To implement the top-down
 * CMT load balancing algorithm, the CMT PGs additionally maintain
 * parent, child and sibling hierarchy relationships.
 * Parent PGs always contain a superset of their children's resources,
 * each PG can have at most one parent, and siblings are the group of PGs
 * sharing the same parent.
 *
 * On NUMA systems, the CMT load balancing algorithm balances across the
 * CMT PGs within their respective lgroups. On UMA based systems, there
 * exists a top level group of PGs to balance across. On NUMA systems multiple
 * top level groups are instantiated, where the top level balancing begins by
 * balancing across the CMT PGs within their respective (per lgroup) top level
 * groups.
 */
static cmt_lgrp_t	*cmt_lgrps = NULL;	/* cmt_lgrps list head */
static cmt_lgrp_t	*cpu0_lgrp = NULL;	/* boot CPU's initial lgrp */
						/* used for null_proc_lpa */
cmt_lgrp_t		*cmt_root = NULL;	/* Reference to root cmt pg */

static int		is_cpu0 = 1;	/* true if this is boot CPU context */

/*
 * Array of hardware sharing relationships that are blacklisted.
 * PGs won't be instantiated for blacklisted hardware sharing relationships.
 */
static int		cmt_hw_blacklisted[PGHW_NUM_COMPONENTS];

/*
 * Set this to non-zero to disable CMT scheduling
 * This must be done via kmdb -d, as /etc/system will be too late
 */
int			cmt_sched_disabled = 0;

static pg_cid_t		pg_cmt_class_id;	/* PG class id */

static pg_t		*pg_cmt_alloc();
static void		pg_cmt_free(pg_t *);
static void		pg_cmt_cpu_init(cpu_t *);
static void		pg_cmt_cpu_fini(cpu_t *);
static void		pg_cmt_cpu_active(cpu_t *);
static void		pg_cmt_cpu_inactive(cpu_t *);
static void		pg_cmt_cpupart_in(cpu_t *, cpupart_t *);
static void		pg_cmt_cpupart_move(cpu_t *, cpupart_t *, cpupart_t *);
static char		*pg_cmt_policy_name(pg_t *);
static void		pg_cmt_hier_sort(pg_cmt_t **, int);
static pg_cmt_t		*pg_cmt_hier_rank(pg_cmt_t *, pg_cmt_t *);
static int		pg_cmt_cpu_belongs(pg_t *, cpu_t *);
static int		pg_cmt_hw(pghw_type_t);
static cmt_lgrp_t	*pg_cmt_find_lgrp(lgrp_handle_t);
static cmt_lgrp_t	*pg_cmt_lgrp_create(lgrp_handle_t);
static int		pg_cmt_lineage_validate(pg_cmt_t **, int *);
static void		cmt_ev_thread_swtch(pg_t *, cpu_t *, hrtime_t,
			    kthread_t *, kthread_t *);
static void		cmt_ev_thread_swtch_pwr(pg_t *, cpu_t *, hrtime_t,
			    kthread_t *, kthread_t *);
static void		cmt_ev_thread_remain_pwr(pg_t *, cpu_t *, kthread_t *);

/*
 * Macro to test if PG is managed by the CMT PG class
 */
#define	IS_CMT_PG(pg)	(((pg_t *)(pg))->pg_class->pgc_id == pg_cmt_class_id)

/*
 * Status codes for CMT lineage validation
 * See cmt_lineage_validate() below
 */
typedef enum cmt_lineage_validation {
	CMT_LINEAGE_VALID,
	CMT_LINEAGE_NON_CONCENTRIC,
	CMT_LINEAGE_REPAIRED,
	CMT_LINEAGE_UNRECOVERABLE
} cmt_lineage_validation_t;

/*
 * Status of the current lineage under construction.
 * One must be holding cpu_lock to change this.
 */
static cmt_lineage_validation_t	cmt_lineage_status = CMT_LINEAGE_VALID;

/*
 * Power domain definitions (on x86) are defined by ACPI, and
 * therefore may be subject to BIOS bugs.
 */
#define	PG_CMT_HW_SUSPECT(hw)	PGHW_IS_PM_DOMAIN(hw)

/*
 * CMT PG ops
 */
struct pg_ops pg_ops_cmt = {
	pg_cmt_alloc,
	pg_cmt_free,
	pg_cmt_cpu_init,
	pg_cmt_cpu_fini,
	pg_cmt_cpu_active,
	pg_cmt_cpu_inactive,
	pg_cmt_cpupart_in,
	NULL,			/* cpupart_out */
	pg_cmt_cpupart_move,
	pg_cmt_cpu_belongs,
	pg_cmt_policy_name,
};

/*
 * Initialize the CMT PG class
 */
void
pg_cmt_class_init(void)
{
	if (cmt_sched_disabled)
		return;

	pg_cmt_class_id = pg_class_register("cmt", &pg_ops_cmt, PGR_PHYSICAL);
}

/*
 * Called to indicate a new CPU has started up so
 * that either t0 or the slave startup thread can
 * be accounted for.
 */
void
pg_cmt_cpu_startup(cpu_t *cp)
{
	pg_ev_thread_swtch(cp, gethrtime_unscaled(), cp->cpu_idle_thread,
	    cp->cpu_thread);
}

/*
 * Return non-zero if thread can migrate between "from" and "to"
 * without a performance penalty
 */
int
pg_cmt_can_migrate(cpu_t *from, cpu_t *to)
{
	if (from->cpu_physid->cpu_cacheid ==
	    to->cpu_physid->cpu_cacheid)
		return (1);
	return (0);
}
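
/*
 * Illustrative note: cpu_cacheid is derived below (in pg_cmt_cpu_init())
 * from the id of the CPU's shared cache PG, so two hardware strands above
 * the same shared cache compare equal here. A (hypothetical) caller might
 * use this as:
 *
 *	if (pg_cmt_can_migrate(t->t_cpu, cp))
 *		... treat the migration as penalty free ...
 */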

/*
 * CMT class specific PG allocation
 */
static pg_t *
pg_cmt_alloc(void)
{
	return (kmem_zalloc(sizeof (pg_cmt_t), KM_NOSLEEP));
}

/*
 * Class specific PG de-allocation
 */
static void
pg_cmt_free(pg_t *pg)
{
	ASSERT(pg != NULL);
	ASSERT(IS_CMT_PG(pg));

	kmem_free((pg_cmt_t *)pg, sizeof (pg_cmt_t));
}

/*
 * Given a hardware sharing relationship, return which dispatcher
 * policies should be implemented to optimize performance and efficiency
 */
static pg_cmt_policy_t
pg_cmt_policy(pghw_type_t hw)
{
	pg_cmt_policy_t p;

	/*
	 * Give the platform a chance to override the default
	 */
	if ((p = pg_plat_cmt_policy(hw)) != CMT_NO_POLICY)
		return (p);

	switch (hw) {
	case PGHW_IPIPE:
	case PGHW_FPU:
	case PGHW_CHIP:
		return (CMT_BALANCE);
	case PGHW_CACHE:
		return (CMT_AFFINITY);
	case PGHW_POW_ACTIVE:
	case PGHW_POW_IDLE:
		return (CMT_BALANCE);
	default:
		return (CMT_NO_POLICY);
	}
}

/*
 * Rank the importance of optimizing for the pg1 relationship vs.
 * the pg2 relationship.
 */
static pg_cmt_t *
pg_cmt_hier_rank(pg_cmt_t *pg1, pg_cmt_t *pg2)
{
	pghw_type_t hw1 = ((pghw_t *)pg1)->pghw_hw;
	pghw_type_t hw2 = ((pghw_t *)pg2)->pghw_hw;

	/*
	 * A power domain is only important if CPUPM is enabled.
	 */
	if (cpupm_get_policy() == CPUPM_POLICY_DISABLED) {
		if (PGHW_IS_PM_DOMAIN(hw1) && !PGHW_IS_PM_DOMAIN(hw2))
			return (pg2);
		if (PGHW_IS_PM_DOMAIN(hw2) && !PGHW_IS_PM_DOMAIN(hw1))
			return (pg1);
	}

	/*
	 * Otherwise, ask the platform
	 */
	if (pg_plat_hw_rank(hw1, hw2) == hw1)
		return (pg1);
	else
		return (pg2);
}

/*
 * Initialize CMT callbacks for the given PG
 */
static void
cmt_callback_init(pg_t *pg)
{
	switch (((pghw_t *)pg)->pghw_hw) {
	case PGHW_POW_ACTIVE:
		pg->pg_cb.thread_swtch = cmt_ev_thread_swtch_pwr;
		pg->pg_cb.thread_remain = cmt_ev_thread_remain_pwr;
		break;
	default:
		pg->pg_cb.thread_swtch = cmt_ev_thread_swtch;
	}
}

/*
 * Promote PG above its current parent.
 * This is only legal if PG has an equal or greater number of CPUs
 * than its parent.
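 *
 * As a sketch of the intended transformation (hypothetical PGs A and B,
 * where B is an only child of A and both span the same CPUs):
 *
 *	before:    A (parent)		after:    B (parent)
 *	           |				  |
 *	           B (only child)		  A (only child)
 *
 * B takes A's place among A's siblings, and A becomes B's child; the
 * cached per-CPU lineages are patched up to match.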
 */
static void
cmt_hier_promote(pg_cmt_t *pg)
{
	pg_cmt_t	*parent;
	group_t		*children;
	cpu_t		*cpu;
	group_iter_t	iter;
	pg_cpu_itr_t	cpu_iter;
	int		r;
	int		err;

	ASSERT(MUTEX_HELD(&cpu_lock));

	parent = pg->cmt_parent;
	if (parent == NULL) {
		/*
		 * Nothing to do
		 */
		return;
	}

	ASSERT(PG_NUM_CPUS((pg_t *)pg) >= PG_NUM_CPUS((pg_t *)parent));

	/*
	 * We're changing around the hierarchy, which is actively traversed
	 * by the dispatcher. Pause CPUs to ensure exclusivity.
	 */
	pause_cpus(NULL);

	/*
	 * If necessary, update the parent's sibling set, replacing parent
	 * with PG.
	 */
	if (parent->cmt_siblings) {
		if (group_remove(parent->cmt_siblings, parent, GRP_NORESIZE)
		    != -1) {
			r = group_add(parent->cmt_siblings, pg, GRP_NORESIZE);
			ASSERT(r != -1);
		}
	}

	/*
	 * If the parent is at the top of the hierarchy, replace its entry
	 * in the root lgroup's group of top level PGs.
	 */
	if (parent->cmt_parent == NULL &&
	    parent->cmt_siblings != &cmt_root->cl_pgs) {
		if (group_remove(&cmt_root->cl_pgs, parent, GRP_NORESIZE)
		    != -1) {
			r = group_add(&cmt_root->cl_pgs, pg, GRP_NORESIZE);
			ASSERT(r != -1);
		}
	}

	/*
	 * We assume (and therefore assert) that the PG being promoted is an
	 * only child of its parent. Update the parent's children set
	 * replacing PG's entry with the parent (since the parent is becoming
	 * the child). Then have PG and the parent swap children sets.
	 */
	ASSERT(GROUP_SIZE(parent->cmt_children) <= 1);
	if (group_remove(parent->cmt_children, pg, GRP_NORESIZE) != -1) {
		r = group_add(parent->cmt_children, parent, GRP_NORESIZE);
		ASSERT(r != -1);
	}

	children = pg->cmt_children;
	pg->cmt_children = parent->cmt_children;
	parent->cmt_children = children;

	/*
	 * Update the sibling references for PG and its parent
	 */
	pg->cmt_siblings = parent->cmt_siblings;
	parent->cmt_siblings = pg->cmt_children;

	/*
	 * Update any cached lineages in the per CPU pg data.
	 */
	PG_CPU_ITR_INIT(pg, cpu_iter);
	while ((cpu = pg_cpu_next(&cpu_iter)) != NULL) {
		int		idx;
		group_t		*pgs;
		pg_cmt_t	*cpu_pg;

		/*
		 * Iterate over the CPU's PGs updating the children
		 * of the PG being promoted, since they have a new parent.
		 */
		pgs = &cpu->cpu_pg->pgs;
		group_iter_init(&iter);
		while ((cpu_pg = group_iterate(pgs, &iter)) != NULL) {
			if (cpu_pg->cmt_parent == pg) {
				cpu_pg->cmt_parent = parent;
			}
		}

		/*
		 * Update the CMT load balancing lineage
		 */
		pgs = &cpu->cpu_pg->cmt_pgs;
		if ((idx = group_find(pgs, (void *)pg)) == -1) {
			/*
			 * Unless this is the CPU whose lineage is being
			 * constructed, the PG being promoted should be
			 * in the lineage.
			 */
			ASSERT(GROUP_SIZE(pgs) == 0);
			continue;
		}

		ASSERT(GROUP_ACCESS(pgs, idx - 1) == parent);
		ASSERT(idx > 0);

		/*
		 * Have the child and the parent swap places in the CPU's
		 * lineage
		 */
		group_remove_at(pgs, idx);
		group_remove_at(pgs, idx - 1);
		err = group_add_at(pgs, parent, idx);
		ASSERT(err == 0);
		err = group_add_at(pgs, pg, idx - 1);
		ASSERT(err == 0);
	}

	/*
	 * Update the parent references for PG and its parent
	 */
	pg->cmt_parent = parent->cmt_parent;
	parent->cmt_parent = pg;

	start_cpus();
}

/*
 * CMT class callback for a new CPU entering the system
 */
static void
pg_cmt_cpu_init(cpu_t *cp)
{
	pg_cmt_t	*pg;
	group_t		*cmt_pgs;
	int		levels, level;
	pghw_type_t	hw;
	pg_t		*pg_cache = NULL;
	pg_cmt_t	*cpu_cmt_hier[PGHW_NUM_COMPONENTS];
	lgrp_handle_t	lgrp_handle;
	cmt_lgrp_t	*lgrp;

	ASSERT(MUTEX_HELD(&cpu_lock));

	if (cmt_sched_disabled)
		return;

	/*
	 * A new CPU is coming into the system.
	 * Interrogate the platform to see if the CPU
	 * has any performance or efficiency relevant
	 * sharing relationships
	 */
	cmt_pgs = &cp->cpu_pg->cmt_pgs;
	cp->cpu_pg->cmt_lineage = NULL;

	bzero(cpu_cmt_hier, sizeof (cpu_cmt_hier));
	levels = 0;
	for (hw = PGHW_START; hw < PGHW_NUM_COMPONENTS; hw++) {

		pg_cmt_policy_t	policy;

		/*
		 * We're only interested in the hw sharing relationships
		 * for which we know how to optimize.
		 */
		policy = pg_cmt_policy(hw);
		if (policy == CMT_NO_POLICY ||
		    pg_plat_hw_shared(cp, hw) == 0)
			continue;

		/*
		 * Continue if the hardware sharing relationship has been
		 * blacklisted.
		 */
		if (cmt_hw_blacklisted[hw]) {
			continue;
		}

		/*
		 * Find (or create) the PG associated with
		 * the hw sharing relationship in which cp
		 * belongs.
		 *
		 * Determine if a suitable PG already
		 * exists, or if one needs to be created.
		 */
		pg = (pg_cmt_t *)pghw_place_cpu(cp, hw);
		if (pg == NULL) {
			/*
			 * Create a new one.
			 * Initialize the common...
			 */
			pg = (pg_cmt_t *)pg_create(pg_cmt_class_id);

			/* ... physical ... */
			pghw_init((pghw_t *)pg, cp, hw);

			/*
			 * ... and CMT specific portions of the
			 * structure.
			 */
			pg->cmt_policy = policy;

			/* CMT event callbacks */
			cmt_callback_init((pg_t *)pg);

			bitset_init(&pg->cmt_cpus_actv_set);
			group_create(&pg->cmt_cpus_actv);
		} else {
			ASSERT(IS_CMT_PG(pg));
		}

		/* Add the CPU to the PG */
		pg_cpu_add((pg_t *)pg, cp);

		/*
		 * Ensure capacity of the active CPU group/bitset
		 */
		group_expand(&pg->cmt_cpus_actv,
		    GROUP_SIZE(&((pg_t *)pg)->pg_cpus));

		if (cp->cpu_seqid >=
		    bitset_capacity(&pg->cmt_cpus_actv_set)) {
			bitset_resize(&pg->cmt_cpus_actv_set,
			    cp->cpu_seqid + 1);
		}

		/*
		 * Build a lineage of CMT PGs for load balancing / coalescence
		 */
		if (policy & (CMT_BALANCE | CMT_COALESCE)) {
			cpu_cmt_hier[levels++] = pg;
		}

		/* Cache this for later */
		if (hw == PGHW_CACHE)
			pg_cache = (pg_t *)pg;
	}

	group_expand(cmt_pgs, levels);

	if (cmt_root == NULL)
		cmt_root = pg_cmt_lgrp_create(lgrp_plat_root_hand());

	/*
	 * Find the lgrp that encapsulates this CPU's CMT hierarchy
	 */
	lgrp_handle = lgrp_plat_cpu_to_hand(cp->cpu_id);
	if ((lgrp = pg_cmt_find_lgrp(lgrp_handle)) == NULL)
		lgrp = pg_cmt_lgrp_create(lgrp_handle);

	/*
	 * Sort the PGs in the lineage by number of CPUs, ascending
	 */
	pg_cmt_hier_sort(cpu_cmt_hier, levels);
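
	/*
	 * Illustrative (hypothetical) example: if cp shares a pipeline with
	 * one other CPU and a chip with seven others, the sorted lineage is
	 *
	 *	cpu_cmt_hier[0] = pipeline PG (2 CPUs)
	 *	cpu_cmt_hier[1] = chip PG (8 CPUs)
	 *
	 * i.e. the smallest (most specific) PG first, which becomes the
	 * CPU's cmt_lineage below.
	 */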

	/*
	 * Examine the lineage and validate it.
	 * This routine will also try to fix the lineage along with the
	 * rest of the PG hierarchy should it detect an issue.
	 *
	 * If it returns -1, an unrecoverable error has happened and we
	 * need to return.
	 */
	if (pg_cmt_lineage_validate(cpu_cmt_hier, &levels) < 0)
		return;

	/*
	 * For existing PGs in the lineage, verify that the parent is
	 * correct, as the generation in the lineage may have changed
	 * as a result of the sorting. Start the traversal at the top
	 * of the lineage, moving down.
	 */
	for (level = levels - 1; level >= 0; ) {
		int reorg;

		reorg = 0;
		pg = cpu_cmt_hier[level];

		/*
		 * Promote PGs at an incorrect generation into place.
		 */
		while (pg->cmt_parent &&
		    pg->cmt_parent != cpu_cmt_hier[level + 1]) {
			cmt_hier_promote(pg);
			reorg++;
		}
		if (reorg > 0)
			level = levels - 1;
		else
			level--;
	}

	/*
	 * For each of the PGs in the CPU's lineage:
	 *	- Add an entry in the CPU sorted CMT PG group
	 *	  which is used for top down CMT load balancing
	 *	- Tie the PG into the CMT hierarchy by connecting
	 *	  it to its parent and siblings.
	 */
	for (level = 0; level < levels; level++) {
		uint_t		children;
		int		err;

		pg = cpu_cmt_hier[level];
		err = group_add_at(cmt_pgs, pg, levels - level - 1);
		ASSERT(err == 0);

		if (level == 0)
			cp->cpu_pg->cmt_lineage = (pg_t *)pg;

		if (pg->cmt_siblings != NULL) {
			/* Already initialized */
			ASSERT(pg->cmt_parent == NULL ||
			    pg->cmt_parent == cpu_cmt_hier[level + 1]);
			ASSERT(pg->cmt_siblings == &lgrp->cl_pgs ||
			    ((pg->cmt_parent != NULL) &&
			    pg->cmt_siblings == pg->cmt_parent->cmt_children));
			continue;
		}

		if ((level + 1) == levels) {
			pg->cmt_parent = NULL;

			pg->cmt_siblings = &lgrp->cl_pgs;
			children = ++lgrp->cl_npgs;
			if (cmt_root != lgrp)
				cmt_root->cl_npgs++;
		} else {
			pg->cmt_parent = cpu_cmt_hier[level + 1];

			/*
			 * A good parent keeps track of their children.
			 * The parent's children group is also the PG's
			 * siblings.
			 */
			if (pg->cmt_parent->cmt_children == NULL) {
				pg->cmt_parent->cmt_children =
				    kmem_zalloc(sizeof (group_t), KM_SLEEP);
				group_create(pg->cmt_parent->cmt_children);
			}
			pg->cmt_siblings = pg->cmt_parent->cmt_children;
			children = ++pg->cmt_parent->cmt_nchildren;
		}

		group_expand(pg->cmt_siblings, children);
		group_expand(&cmt_root->cl_pgs, cmt_root->cl_npgs);
	}

	/*
	 * Cache the chip and core IDs in the cpu_t->cpu_physid structure
	 * for fast lookups later.
	 */
	if (cp->cpu_physid) {
		cp->cpu_physid->cpu_chipid =
		    pg_plat_hw_instance_id(cp, PGHW_CHIP);
		cp->cpu_physid->cpu_coreid = pg_plat_get_core_id(cp);

		/*
		 * If this cpu has a PG representing shared cache, then set
		 * cpu_cacheid to that PG's logical id
		 */
		if (pg_cache)
			cp->cpu_physid->cpu_cacheid = pg_cache->pg_id;
	}

	/* CPU0 only initialization */
	if (is_cpu0) {
		pg_cmt_cpu_startup(cp);
		is_cpu0 = 0;
		cpu0_lgrp = lgrp;
	}
}

/*
 * Class callback when a CPU is leaving the system (deletion)
 */
static void
pg_cmt_cpu_fini(cpu_t *cp)
{
	group_iter_t	i;
	pg_cmt_t	*pg;
	group_t		*pgs, *cmt_pgs;
	lgrp_handle_t	lgrp_handle;
	cmt_lgrp_t	*lgrp;

	if (cmt_sched_disabled)
		return;

	pgs = &cp->cpu_pg->pgs;
	cmt_pgs = &cp->cpu_pg->cmt_pgs;

	/*
	 * Find the lgroup that encapsulates this CPU's CMT hierarchy
	 */
	lgrp_handle = lgrp_plat_cpu_to_hand(cp->cpu_id);

	lgrp = pg_cmt_find_lgrp(lgrp_handle);
	if (ncpus == 1 && lgrp != cpu0_lgrp) {
		/*
		 * One might wonder how we could be deconfiguring the
		 * only CPU in the system.
		 *
		 * On Starcat systems when null_proc_lpa is detected,
		 * the boot CPU (which is already configured into a leaf
		 * lgroup) is moved into the root lgroup. This is done by
		 * deconfiguring it from both lgroups and processor
		 * groups, and then later reconfiguring it back in. This
		 * call to pg_cmt_cpu_fini() is part of that deconfiguration.
		 *
		 * This special case is detected by noting that the platform
		 * has changed the CPU's lgrp affiliation (since it now
		 * belongs in the root). In this case, use the cmt_lgrp_t
		 * cached for the boot CPU, since this is what needs to be
		 * torn down.
		 */
		lgrp = cpu0_lgrp;
	}

	ASSERT(lgrp != NULL);

	/*
	 * First, clean up anything load balancing specific for each of
	 * the CPU's PGs that participated in CMT load balancing
	 */
	pg = (pg_cmt_t *)cp->cpu_pg->cmt_lineage;
	while (pg != NULL) {

		/*
		 * Remove the PG from the CPU's load balancing lineage
		 */
		(void) group_remove(cmt_pgs, pg, GRP_RESIZE);

		/*
		 * If it's about to become empty, destroy its children
		 * group, and remove its reference from its siblings.
		 * This is done here (rather than below) to avoid removing
		 * our reference from a PG that we just eliminated.
		 */
		if (GROUP_SIZE(&((pg_t *)pg)->pg_cpus) == 1) {
			if (pg->cmt_children != NULL)
				group_destroy(pg->cmt_children);
			if (pg->cmt_siblings != NULL) {
				if (pg->cmt_siblings == &lgrp->cl_pgs)
					lgrp->cl_npgs--;
				else
					pg->cmt_parent->cmt_nchildren--;
			}
		}
		pg = pg->cmt_parent;
	}
	ASSERT(GROUP_SIZE(cmt_pgs) == 0);

	/*
	 * Now that the load balancing lineage updates have happened,
	 * remove the CPU from all its PGs (destroying any that become
	 * empty).
	 */
	group_iter_init(&i);
	while ((pg = group_iterate(pgs, &i)) != NULL) {
		if (IS_CMT_PG(pg) == 0)
			continue;

		pg_cpu_delete((pg_t *)pg, cp);
		/*
		 * Deleting the CPU from the PG changes the CPU's
		 * PG group over which we are actively iterating
		 * Re-initialize the iteration
		 */
		group_iter_init(&i);

		if (GROUP_SIZE(&((pg_t *)pg)->pg_cpus) == 0) {

			/*
			 * The PG has become zero sized, so destroy it.
			 */
			group_destroy(&pg->cmt_cpus_actv);
			bitset_fini(&pg->cmt_cpus_actv_set);
			pghw_fini((pghw_t *)pg);

			pg_destroy((pg_t *)pg);
		}
	}
}

/*
 * Class callback when a CPU is entering a cpu partition
 */
static void
pg_cmt_cpupart_in(cpu_t *cp, cpupart_t *pp)
{
	group_t		*pgs;
	pg_t		*pg;
	group_iter_t	i;

	ASSERT(MUTEX_HELD(&cpu_lock));

	if (cmt_sched_disabled)
		return;

	pgs = &cp->cpu_pg->pgs;

	/*
	 * Ensure that the new partition's PG bitset
	 * is large enough for all CMT PGs to which cp
	 * belongs
	 */
	group_iter_init(&i);
	while ((pg = group_iterate(pgs, &i)) != NULL) {
		if (IS_CMT_PG(pg) == 0)
			continue;

		if (bitset_capacity(&pp->cp_cmt_pgs) <= pg->pg_id)
			bitset_resize(&pp->cp_cmt_pgs, pg->pg_id + 1);
	}
}

/*
 * Class callback when a CPU is actually moving partitions
 */
static void
pg_cmt_cpupart_move(cpu_t *cp, cpupart_t *oldpp, cpupart_t *newpp)
{
	cpu_t		*cpp;
	group_t		*pgs;
	pg_t		*pg;
	group_iter_t	pg_iter;
	pg_cpu_itr_t	cpu_iter;
	boolean_t	found;

	ASSERT(MUTEX_HELD(&cpu_lock));

	if (cmt_sched_disabled)
		return;

	pgs = &cp->cpu_pg->pgs;
	group_iter_init(&pg_iter);

	/*
	 * Iterate over the CPU's CMT PGs
	 */
	while ((pg = group_iterate(pgs, &pg_iter)) != NULL) {

		if (IS_CMT_PG(pg) == 0)
			continue;

		/*
		 * Add the PG to the bitset in the new partition.
		 */
		bitset_add(&newpp->cp_cmt_pgs, pg->pg_id);

		/*
		 * Remove the PG from the bitset in the old partition
		 * if the last of the PG's CPUs have left.
		 */
		found = B_FALSE;
		PG_CPU_ITR_INIT(pg, cpu_iter);
		while ((cpp = pg_cpu_next(&cpu_iter)) != NULL) {
			if (cpp == cp)
				continue;
			if (CPU_ACTIVE(cpp) &&
			    cpp->cpu_part->cp_id == oldpp->cp_id) {
				found = B_TRUE;
				break;
			}
		}
		if (!found)
			bitset_del(&cp->cpu_part->cp_cmt_pgs, pg->pg_id);
	}
}

/*
 * Class callback when a CPU becomes active (online)
 *
 * This is called in a context where CPUs are paused
 */
static void
pg_cmt_cpu_active(cpu_t *cp)
{
	int		err;
	group_iter_t	i;
	pg_cmt_t	*pg;
	group_t		*pgs;

	ASSERT(MUTEX_HELD(&cpu_lock));

	if (cmt_sched_disabled)
		return;

	pgs = &cp->cpu_pg->pgs;
	group_iter_init(&i);

	/*
	 * Iterate over the CPU's PGs
	 */
	while ((pg = group_iterate(pgs, &i)) != NULL) {

		if (IS_CMT_PG(pg) == 0)
			continue;

		err = group_add(&pg->cmt_cpus_actv, cp, GRP_NORESIZE);
		ASSERT(err == 0);

		/*
		 * If this is the first active CPU in the PG, and it
		 * represents a hardware sharing relationship over which
		 * CMT load balancing is performed, add it as a candidate
		 * for balancing with its siblings.
		 */
		if (GROUP_SIZE(&pg->cmt_cpus_actv) == 1 &&
		    (pg->cmt_policy & (CMT_BALANCE | CMT_COALESCE))) {
			err = group_add(pg->cmt_siblings, pg, GRP_NORESIZE);
			ASSERT(err == 0);

			/*
			 * If this is a top level PG, add it as a balancing
			 * candidate when balancing within the root lgroup.
			 */
			if (pg->cmt_parent == NULL &&
			    pg->cmt_siblings != &cmt_root->cl_pgs) {
				err = group_add(&cmt_root->cl_pgs, pg,
				    GRP_NORESIZE);
				ASSERT(err == 0);
			}
		}

		/*
		 * Notate the CPU in the PG's active CPU bitset.
		 * Also notate the PG as being active in its associated
		 * partition
		 */
		bitset_add(&pg->cmt_cpus_actv_set, cp->cpu_seqid);
		bitset_add(&cp->cpu_part->cp_cmt_pgs, ((pg_t *)pg)->pg_id);
	}
}

/*
 * Class callback when a CPU goes inactive (offline)
 *
 * This is called in a context where CPUs are paused
 */
static void
pg_cmt_cpu_inactive(cpu_t *cp)
{
	int		err;
	group_t		*pgs;
	pg_cmt_t	*pg;
	cpu_t		*cpp;
	group_iter_t	i;
	pg_cpu_itr_t	cpu_itr;
	boolean_t	found;

	ASSERT(MUTEX_HELD(&cpu_lock));

	if (cmt_sched_disabled)
		return;

	pgs = &cp->cpu_pg->pgs;
	group_iter_init(&i);

	while ((pg = group_iterate(pgs, &i)) != NULL) {

		if (IS_CMT_PG(pg) == 0)
			continue;

		/*
		 * Remove the CPU from the CMT PG's active CPU group
		 * bitmap
		 */
		err = group_remove(&pg->cmt_cpus_actv, cp, GRP_NORESIZE);
		ASSERT(err == 0);

		bitset_del(&pg->cmt_cpus_actv_set, cp->cpu_seqid);

		/*
		 * If there are no more active CPUs in this PG over which
		 * load was balanced, remove it as a balancing candidate.
		 */
		if (GROUP_SIZE(&pg->cmt_cpus_actv) == 0 &&
		    (pg->cmt_policy & (CMT_BALANCE | CMT_COALESCE))) {
			err = group_remove(pg->cmt_siblings, pg, GRP_NORESIZE);
			ASSERT(err == 0);

			if (pg->cmt_parent == NULL &&
			    pg->cmt_siblings != &cmt_root->cl_pgs) {
				err = group_remove(&cmt_root->cl_pgs, pg,
				    GRP_NORESIZE);
				ASSERT(err == 0);
			}
		}

		/*
		 * Assert the number of active CPUs does not exceed
		 * the total number of CPUs in the PG
		 */
		ASSERT(GROUP_SIZE(&pg->cmt_cpus_actv) <=
		    GROUP_SIZE(&((pg_t *)pg)->pg_cpus));

		/*
		 * Update the PG bitset in the CPU's old partition
		 */
		found = B_FALSE;
		PG_CPU_ITR_INIT(pg, cpu_itr);
		while ((cpp = pg_cpu_next(&cpu_itr)) != NULL) {
			if (cpp == cp)
				continue;
			if (CPU_ACTIVE(cpp) &&
			    cpp->cpu_part->cp_id == cp->cpu_part->cp_id) {
				found = B_TRUE;
				break;
			}
		}
		if (!found) {
			bitset_del(&cp->cpu_part->cp_cmt_pgs,
			    ((pg_t *)pg)->pg_id);
		}
	}
}

/*
 * Return non-zero if the CPU belongs in the given PG
 */
static int
pg_cmt_cpu_belongs(pg_t *pg, cpu_t *cp)
{
	cpu_t	*pg_cpu;

	pg_cpu = GROUP_ACCESS(&pg->pg_cpus, 0);

	ASSERT(pg_cpu != NULL);

	/*
	 * The CPU belongs if, given the nature of the hardware sharing
	 * relationship represented by the PG, the CPU has that
	 * relationship with some other CPU already in the PG
	 */
	if (pg_plat_cpus_share(cp, pg_cpu, ((pghw_t *)pg)->pghw_hw))
		return (1);

	return (0);
}

/*
 * Sort the CPU's CMT hierarchy, where "size" is the number of levels.
 */
static void
pg_cmt_hier_sort(pg_cmt_t **hier, int size)
{
	int		i, j, inc;
	pg_t		*tmp;
	pg_t		**h = (pg_t **)hier;

	/*
	 * First sort by number of CPUs
	 */
	inc = size / 2;
	while (inc > 0) {
		for (i = inc; i < size; i++) {
			j = i;
			tmp = h[i];
			while ((j >= inc) &&
			    (PG_NUM_CPUS(h[j - inc]) > PG_NUM_CPUS(tmp))) {
				h[j] = h[j - inc];
				j = j - inc;
			}
			h[j] = tmp;
		}
		if (inc == 2)
			inc = 1;
		else
			inc = (inc * 5) / 11;
	}

	/*
	 * Break ties by asking the platform.
	 * Determine if h[i] outranks h[i + 1] and if so, swap them.
	 */
	for (i = 0; i < size - 1; i++) {
		if ((PG_NUM_CPUS(h[i]) == PG_NUM_CPUS(h[i + 1])) &&
		    pg_cmt_hier_rank(hier[i], hier[i + 1]) == hier[i]) {
			tmp = h[i];
			h[i] = h[i + 1];
			h[i + 1] = tmp;
		}
	}
}

/*
 * Return a cmt_lgrp_t * given an lgroup handle.
 */
static cmt_lgrp_t *
pg_cmt_find_lgrp(lgrp_handle_t hand)
{
	cmt_lgrp_t	*lgrp;

	ASSERT(MUTEX_HELD(&cpu_lock));

	lgrp = cmt_lgrps;
	while (lgrp != NULL) {
		if (lgrp->cl_hand == hand)
			break;
		lgrp = lgrp->cl_next;
	}
	return (lgrp);
}

/*
 * Create a cmt_lgrp_t with the specified handle.
 */
static cmt_lgrp_t *
pg_cmt_lgrp_create(lgrp_handle_t hand)
{
	cmt_lgrp_t	*lgrp;

	ASSERT(MUTEX_HELD(&cpu_lock));

	lgrp = kmem_zalloc(sizeof (cmt_lgrp_t), KM_SLEEP);

	lgrp->cl_hand = hand;
	lgrp->cl_npgs = 0;
	lgrp->cl_next = cmt_lgrps;
	cmt_lgrps = lgrp;
	group_create(&lgrp->cl_pgs);

	return (lgrp);
}
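
/*
 * Usage sketch: callers needing a cmt_lgrp_t for a handle typically
 * combine the two routines above, as pg_cmt_cpu_init() does:
 *
 *	if ((lgrp = pg_cmt_find_lgrp(lgrp_handle)) == NULL)
 *		lgrp = pg_cmt_lgrp_create(lgrp_handle);
 */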

/*
 * Interfaces to enable and disable power aware dispatching
 * The caller must be holding cpu_lock.
 *
 * Return 0 on success and -1 on failure.
 */
int
cmt_pad_enable(pghw_type_t type)
{
	group_t		*hwset;
	group_iter_t	iter;
	pg_cmt_t	*pg;

	ASSERT(PGHW_IS_PM_DOMAIN(type));
	ASSERT(MUTEX_HELD(&cpu_lock));

	if ((hwset = pghw_set_lookup(type)) == NULL ||
	    cmt_hw_blacklisted[type]) {
		/*
		 * Unable to find any instances of the specified type
		 * of power domain, or the power domains have been blacklisted.
		 */
		return (-1);
	}

	/*
	 * Iterate over the power domains, setting the default dispatcher
	 * policy for power/performance optimization.
	 *
	 * Simply setting the policy isn't enough in the case where the power
	 * domain is an only child of another PG. Because the dispatcher walks
	 * the PG hierarchy in a top down fashion, the higher up PG's policy
	 * will dominate. So promote the power domain above its parent if both
	 * PG and its parent have the same CPUs to ensure its policy
	 * dominates.
	 */
	group_iter_init(&iter);
	while ((pg = group_iterate(hwset, &iter)) != NULL) {
		/*
		 * If the power domain is an only child to a parent
		 * not implementing the same policy, promote the child
		 * above the parent to activate the policy.
		 */
		pg->cmt_policy = pg_cmt_policy(((pghw_t *)pg)->pghw_hw);
		while ((pg->cmt_parent != NULL) &&
		    (pg->cmt_parent->cmt_policy != pg->cmt_policy) &&
		    (PG_NUM_CPUS((pg_t *)pg) ==
		    PG_NUM_CPUS((pg_t *)pg->cmt_parent))) {
			cmt_hier_promote(pg);
		}
	}

	return (0);
}
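
/*
 * Usage sketch (hypothetical caller): platform CPUPM code, holding
 * cpu_lock as required above, might enable power aware dispatching for
 * active power domains with:
 *
 *	mutex_enter(&cpu_lock);
 *	if (cmt_pad_enable(PGHW_POW_ACTIVE) != 0)
 *		... fall back to non-dispatcher-assisted power management ...
 *	mutex_exit(&cpu_lock);
 */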

int
cmt_pad_disable(pghw_type_t type)
{
	group_t		*hwset;
	group_iter_t	iter;
	pg_cmt_t	*pg;
	pg_cmt_t	*child;

	ASSERT(PGHW_IS_PM_DOMAIN(type));
	ASSERT(MUTEX_HELD(&cpu_lock));

	if ((hwset = pghw_set_lookup(type)) == NULL) {
		/*
		 * Unable to find any instances of the specified type of
		 * power domain.
		 */
		return (-1);
	}
	/*
	 * Iterate over the power domains, setting the default dispatcher
	 * policy for performance optimization (load balancing).
	 */
	group_iter_init(&iter);
	while ((pg = group_iterate(hwset, &iter)) != NULL) {

		/*
		 * If the power domain has an only child that implements
		 * policy other than load balancing, promote the child
		 * above the power domain to ensure its policy dominates.
		 */
		if (GROUP_SIZE(pg->cmt_children) == 1) {
			child = GROUP_ACCESS(pg->cmt_children, 0);
			if ((child->cmt_policy & CMT_BALANCE) == 0) {
				cmt_hier_promote(child);
			}
		}
		pg->cmt_policy = CMT_BALANCE;
	}
	return (0);
}

/* ARGSUSED */
static void
cmt_ev_thread_swtch(pg_t *pg, cpu_t *cp, hrtime_t now, kthread_t *old,
    kthread_t *new)
{
	pg_cmt_t	*cmt_pg = (pg_cmt_t *)pg;

	if (old == cp->cpu_idle_thread) {
		atomic_add_32(&cmt_pg->cmt_utilization, 1);
	} else if (new == cp->cpu_idle_thread) {
		atomic_add_32(&cmt_pg->cmt_utilization, -1);
	}
}
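
/*
 * Note (derived from the switch handlers here and below): cmt_utilization
 * counts the PG's CPUs currently running something other than their idle
 * thread. For example, on an otherwise idle 2 CPU PG, a single idle ->
 * non-idle switch takes the count from 0 to 1; that 0 -> 1 transition is
 * what the power aware variant below reports to the CPU power manager as
 * CPUPM_DOM_BUSY_FROM_IDLE.
 */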

/*
 * Macro to test whether a thread is currently runnable on a CPU in a PG.
 */
#define	THREAD_RUNNABLE_IN_PG(t, pg)					\
	((t)->t_state == TS_RUN &&					\
	    (t)->t_disp_queue->disp_cpu &&				\
	    bitset_in_set(&(pg)->cmt_cpus_actv_set,			\
	    (t)->t_disp_queue->disp_cpu->cpu_seqid))

static void
cmt_ev_thread_swtch_pwr(pg_t *pg, cpu_t *cp, hrtime_t now, kthread_t *old,
    kthread_t *new)
{
	pg_cmt_t	*cmt = (pg_cmt_t *)pg;
	cpupm_domain_t	*dom;
	uint32_t	u;

	if (old == cp->cpu_idle_thread) {
		ASSERT(new != cp->cpu_idle_thread);
		u = atomic_add_32_nv(&cmt->cmt_utilization, 1);
		if (u == 1) {
			/*
			 * Notify the CPU power manager that the domain
			 * is non-idle.
			 */
			dom = (cpupm_domain_t *)cmt->cmt_pg.pghw_handle;
			cpupm_utilization_event(cp, now, dom,
			    CPUPM_DOM_BUSY_FROM_IDLE);
		}
	} else if (new == cp->cpu_idle_thread) {
		ASSERT(old != cp->cpu_idle_thread);
		u = atomic_add_32_nv(&cmt->cmt_utilization, -1);
		if (u == 0) {
			/*
			 * The domain is idle, notify the CPU power
			 * manager.
			 *
			 * Avoid notifying if the thread is simply migrating
			 * between CPUs in the domain.
			 */
			if (!THREAD_RUNNABLE_IN_PG(old, cmt)) {
				dom = (cpupm_domain_t *)cmt->cmt_pg.pghw_handle;
				cpupm_utilization_event(cp, now, dom,
				    CPUPM_DOM_IDLE_FROM_BUSY);
			}
		}
	}
}

/* ARGSUSED */
static void
cmt_ev_thread_remain_pwr(pg_t *pg, cpu_t *cp, kthread_t *t)
{
	pg_cmt_t	*cmt = (pg_cmt_t *)pg;
	cpupm_domain_t	*dom;

	dom = (cpupm_domain_t *)cmt->cmt_pg.pghw_handle;
	cpupm_utilization_event(cp, (hrtime_t)0, dom, CPUPM_DOM_REMAIN_BUSY);
}

/*
 * Return the name of the CMT scheduling policy
 * being implemented across this PG
 */
static char *
pg_cmt_policy_name(pg_t *pg)
{
	pg_cmt_policy_t policy;

	policy = ((pg_cmt_t *)pg)->cmt_policy;

	if (policy & CMT_AFFINITY) {
		if (policy & CMT_BALANCE)
			return ("Load Balancing & Affinity");
		else if (policy & CMT_COALESCE)
			return ("Load Coalescence & Affinity");
		else
			return ("Affinity");
	} else {
		if (policy & CMT_BALANCE)
			return ("Load Balancing");
		else if (policy & CMT_COALESCE)
			return ("Load Coalescence");
		else
			return ("None");
	}
}
" 1371*0e751525SEric Saxe "Dispatcher assisted CPUPM disabled."); 1372*0e751525SEric Saxe } 13736890d023SEric Saxe 13746890d023SEric Saxe /* 1375*0e751525SEric Saxe * Find and eliminate the PG from the lineage. 13766890d023SEric Saxe */ 1377*0e751525SEric Saxe for (i = 0; i < size; i++) { 1378*0e751525SEric Saxe if (lineage[i] == pg_bad) { 1379*0e751525SEric Saxe for (j = i; j < size - 1; j++) 1380*0e751525SEric Saxe lineage[j] = lineage[j + 1]; 1381*0e751525SEric Saxe *sz = size - 1; 1382*0e751525SEric Saxe break; 1383*0e751525SEric Saxe } 1384*0e751525SEric Saxe } 1385*0e751525SEric Saxe 1386*0e751525SEric Saxe /* 1387*0e751525SEric Saxe * We'll prune all instances of the hardware sharing relationship 1388*0e751525SEric Saxe * represented by pg. But before we do that (and pause CPUs) we need 1389*0e751525SEric Saxe * to ensure the hierarchy's groups are properly sized. 1390*0e751525SEric Saxe */ 1391*0e751525SEric Saxe hwset = pghw_set_lookup(hw); 1392*0e751525SEric Saxe 1393*0e751525SEric Saxe /* 1394*0e751525SEric Saxe * Blacklist the hardware so that future groups won't be created. 1395*0e751525SEric Saxe */ 1396*0e751525SEric Saxe cmt_hw_blacklisted[hw] = 1; 1397*0e751525SEric Saxe 1398*0e751525SEric Saxe /* 1399*0e751525SEric Saxe * For each of the PGs being pruned, ensure sufficient capacity in 1400*0e751525SEric Saxe * the siblings set for the PG's children 1401*0e751525SEric Saxe */ 1402*0e751525SEric Saxe group_iter_init(&hw_iter); 1403*0e751525SEric Saxe while ((pg = group_iterate(hwset, &hw_iter)) != NULL) { 1404*0e751525SEric Saxe /* 1405*0e751525SEric Saxe * PG is being pruned, but if it is bringing up more than 1406*0e751525SEric Saxe * one child, ask for more capacity in the siblings group. 1407*0e751525SEric Saxe */ 1408*0e751525SEric Saxe cap_needed = 0; 1409*0e751525SEric Saxe if (pg->cmt_children && 1410*0e751525SEric Saxe GROUP_SIZE(pg->cmt_children) > 1) { 1411*0e751525SEric Saxe cap_needed = GROUP_SIZE(pg->cmt_children) - 1; 1412*0e751525SEric Saxe 1413*0e751525SEric Saxe group_expand(pg->cmt_siblings, 1414*0e751525SEric Saxe GROUP_SIZE(pg->cmt_siblings) + cap_needed); 1415*0e751525SEric Saxe 1416*0e751525SEric Saxe /* 1417*0e751525SEric Saxe * If this is a top level group, also ensure the 1418*0e751525SEric Saxe * capacity in the root lgrp level CMT grouping. 1419*0e751525SEric Saxe */ 1420*0e751525SEric Saxe if (pg->cmt_parent == NULL && 1421*0e751525SEric Saxe pg->cmt_siblings != &cmt_root->cl_pgs) { 1422*0e751525SEric Saxe group_expand(&cmt_root->cl_pgs, 1423*0e751525SEric Saxe GROUP_SIZE(&cmt_root->cl_pgs) + cap_needed); 1424*0e751525SEric Saxe } 1425*0e751525SEric Saxe } 1426*0e751525SEric Saxe } 1427*0e751525SEric Saxe 1428*0e751525SEric Saxe /* 1429*0e751525SEric Saxe * We're operating on the PG hierarchy. Pause CPUs to ensure 1430*0e751525SEric Saxe * exclusivity with respect to the dispatcher. 1431*0e751525SEric Saxe */ 1432*0e751525SEric Saxe pause_cpus(NULL); 1433*0e751525SEric Saxe 1434*0e751525SEric Saxe /* 1435*0e751525SEric Saxe * Prune all PG instances of the hardware sharing relationship 1436*0e751525SEric Saxe * represented by pg. 1437*0e751525SEric Saxe */ 1438*0e751525SEric Saxe group_iter_init(&hw_iter); 1439*0e751525SEric Saxe while ((pg = group_iterate(hwset, &hw_iter)) != NULL) { 1440*0e751525SEric Saxe 1441*0e751525SEric Saxe /* 1442*0e751525SEric Saxe * Remove PG from it's group of siblings, if it's there. 
/* ARGSUSED */
static void
cmt_ev_thread_remain_pwr(pg_t *pg, cpu_t *cp, kthread_t *t)
{
	pg_cmt_t	*cmt = (pg_cmt_t *)pg;
	cpupm_domain_t	*dom;

	dom = (cpupm_domain_t *)cmt->cmt_pg.pghw_handle;
	cpupm_utilization_event(cp, (hrtime_t)0, dom, CPUPM_DOM_REMAIN_BUSY);
}

/*
 * Return the name of the CMT scheduling policy
 * being implemented across this PG.
 */
static char *
pg_cmt_policy_name(pg_t *pg)
{
	pg_cmt_policy_t policy;

	policy = ((pg_cmt_t *)pg)->cmt_policy;

	if (policy & CMT_AFFINITY) {
		if (policy & CMT_BALANCE)
			return ("Load Balancing & Affinity");
		else if (policy & CMT_COALESCE)
			return ("Load Coalescence & Affinity");
		else
			return ("Affinity");
	} else {
		if (policy & CMT_BALANCE)
			return ("Load Balancing");
		else if (policy & CMT_COALESCE)
			return ("Load Coalescence");
		else
			return ("None");
	}
}

/*
 * Prune PG, and all other instances of PG's hardware sharing relationship,
 * from the PG hierarchy.
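 *
 * When an instance is pruned, its children are spliced into its place in
 * the hierarchy: each child inherits the pruned PG's parent and joins the
 * pruned PG's siblings set. For example, pruning PG1 below:
 *
 *          parent                        parent
 *         /      \                      /  |   \
 *       PG1      PG2      ===>      PG1a  PG1b  PG2
 *      /   \
 *    PG1a  PG1b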
 */
static int
pg_cmt_prune(pg_cmt_t *pg_bad, pg_cmt_t **lineage, int *sz)
{
	group_t		*hwset, *children;
	int		i, j, r, size = *sz;
	group_iter_t	hw_iter, child_iter;
	pg_cpu_itr_t	cpu_iter;
	pg_cmt_t	*pg, *child;
	cpu_t		*cpu;
	int		cap_needed;
	pghw_type_t	hw;

	ASSERT(MUTEX_HELD(&cpu_lock));

	hw = ((pghw_t *)pg_bad)->pghw_hw;

	if (hw == PGHW_POW_ACTIVE) {
		cmn_err(CE_NOTE, "!Active CPUPM domain groups look suspect. "
		    "Event based CPUPM unavailable.");
	} else if (hw == PGHW_POW_IDLE) {
		cmn_err(CE_NOTE, "!Idle CPUPM domain groups look suspect. "
		    "Dispatcher assisted CPUPM disabled.");
	}

	/*
	 * Find and eliminate the PG from the lineage.
	 */
	for (i = 0; i < size; i++) {
		if (lineage[i] == pg_bad) {
			for (j = i; j < size - 1; j++)
				lineage[j] = lineage[j + 1];
			*sz = size - 1;
			break;
		}
	}

	/*
	 * We'll prune all instances of the hardware sharing relationship
	 * represented by pg. But before we do that (and pause CPUs), we need
	 * to ensure the hierarchy's groups are properly sized.
	 */
	hwset = pghw_set_lookup(hw);

	/*
	 * Blacklist the hardware so that future groups won't be created.
	 */
	cmt_hw_blacklisted[hw] = 1;

	/*
	 * For each of the PGs being pruned, ensure sufficient capacity in
	 * the siblings set for the PG's children.
	 */
	group_iter_init(&hw_iter);
	while ((pg = group_iterate(hwset, &hw_iter)) != NULL) {
		/*
		 * PG is being pruned, but if it is bringing up more than
		 * one child, ask for more capacity in the siblings group.
		 */
		cap_needed = 0;
		if (pg->cmt_children &&
		    GROUP_SIZE(pg->cmt_children) > 1) {
			cap_needed = GROUP_SIZE(pg->cmt_children) - 1;

			group_expand(pg->cmt_siblings,
			    GROUP_SIZE(pg->cmt_siblings) + cap_needed);

			/*
			 * If this is a top-level group, also ensure the
			 * capacity in the root lgrp level CMT grouping.
			 */
			if (pg->cmt_parent == NULL &&
			    pg->cmt_siblings != &cmt_root->cl_pgs) {
				group_expand(&cmt_root->cl_pgs,
				    GROUP_SIZE(&cmt_root->cl_pgs) + cap_needed);
			}
		}
	}

	/*
	 * We're operating on the PG hierarchy. Pause CPUs to ensure
	 * exclusivity with respect to the dispatcher.
	 */
	pause_cpus(NULL);

	/*
	 * Prune all PG instances of the hardware sharing relationship
	 * represented by pg.
	 */
	group_iter_init(&hw_iter);
	while ((pg = group_iterate(hwset, &hw_iter)) != NULL) {

		/*
		 * Remove PG from its group of siblings, if it's there.
		 */
		if (pg->cmt_siblings) {
			(void) group_remove(pg->cmt_siblings, pg, GRP_NORESIZE);
		}
		if (pg->cmt_parent == NULL &&
		    pg->cmt_siblings != &cmt_root->cl_pgs) {
			(void) group_remove(&cmt_root->cl_pgs, pg,
			    GRP_NORESIZE);
		}
		/*
		 * Add PG's children to its group of siblings.
		 */
		if (pg->cmt_children != NULL) {
			children = pg->cmt_children;

			group_iter_init(&child_iter);
			while ((child = group_iterate(children, &child_iter))
			    != NULL) {
				/*
				 * Transplant child from its siblings set to
				 * PG's.
				 */
				if (pg->cmt_siblings != NULL &&
				    child->cmt_siblings != NULL &&
				    group_remove(child->cmt_siblings, child,
				    GRP_NORESIZE) != -1) {
					r = group_add(pg->cmt_siblings, child,
					    GRP_NORESIZE);
					ASSERT(r == 0);
				}
			}
		}

		/*
		 * Reset the callbacks to the defaults.
		 */
		pg_callback_set_defaults((pg_t *)pg);

		/*
		 * Update the PG lineage of each of PG's CPUs.
		 */
		PG_CPU_ITR_INIT(pg, cpu_iter);
		while ((cpu = pg_cpu_next(&cpu_iter)) != NULL) {
			group_t		*pgs;
			pg_cmt_t	*cpu_pg;
			group_iter_t	liter;	/* Iterator for the lineage */

			/*
			 * Iterate over the CPU's PGs updating the children
			 * of the PG being pruned, since they now have a new
			 * parent and siblings set.
			 */
			pgs = &cpu->cpu_pg->pgs;
			group_iter_init(&liter);
			while ((cpu_pg = group_iterate(pgs, &liter)) != NULL) {
				if (cpu_pg->cmt_parent == pg) {
					cpu_pg->cmt_parent = pg->cmt_parent;
					cpu_pg->cmt_siblings = pg->cmt_siblings;
				}
			}

			/*
			 * Update the CPU's lineages.
			 */
			pgs = &cpu->cpu_pg->cmt_pgs;
			(void) group_remove(pgs, pg, GRP_NORESIZE);
			pgs = &cpu->cpu_pg->pgs;
			(void) group_remove(pgs, pg, GRP_NORESIZE);
		}
	}
	start_cpus();
	return (0);
}

/*
 * Disable CMT scheduling.
 */
static void
pg_cmt_disable(void)
{
	cpu_t	*cpu;

	pause_cpus(NULL);
	cpu = cpu_list;

	do {
		if (cpu->cpu_pg)
			group_empty(&cpu->cpu_pg->cmt_pgs);
	} while ((cpu = cpu->cpu_next) != cpu_list);

	cmt_sched_disabled = 1;
	start_cpus();
	cmn_err(CE_NOTE, "!CMT thread placement optimizations unavailable");
}
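
/*
 * Validate that the PG lineage is concentric: walking the lineage from
 * smallest PG to largest, each PG's CPUs must be a subset of those of
 * the next larger PG. If a violation is found, attempt to repair the
 * hierarchy by pruning the suspect grouping; failing that, disable CMT
 * scheduling altogether.
 */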
static int
pg_cmt_lineage_validate(pg_cmt_t **lineage, int *sz)
{
	int		i, size;
	pg_cmt_t	*pg, *parent, *pg_bad;
	cpu_t		*cp;
	pg_cpu_itr_t	cpu_iter;

	ASSERT(MUTEX_HELD(&cpu_lock));

revalidate:
	size = *sz;
	pg_bad = NULL;
	for (i = 0; i < size - 1; i++) {

		pg = lineage[i];
		parent = lineage[i + 1];

		/*
		 * We assume that the lineage has already been sorted
		 * by the number of CPUs. In fact, we depend on it.
		 */
		ASSERT(PG_NUM_CPUS((pg_t *)pg) <= PG_NUM_CPUS((pg_t *)parent));

		/*
		 * Walk each of the CPUs in the PG's group, and verify that
		 * the next larger PG contains at least the CPUs in this one.
		 */
		PG_CPU_ITR_INIT((pg_t *)pg, cpu_iter);
		while ((cp = pg_cpu_next(&cpu_iter)) != NULL) {
			if (pg_cpu_find((pg_t *)parent, cp) == B_FALSE) {
				cmt_lineage_status = CMT_LINEAGE_NON_CONCENTRIC;
				goto handle_error;
			}
		}
	}

handle_error:
	switch (cmt_lineage_status) {
	case CMT_LINEAGE_VALID:
	case CMT_LINEAGE_REPAIRED:
		break;
	case CMT_LINEAGE_NON_CONCENTRIC:
		/*
		 * We've detected a non-concentric PG lineage.
		 *
		 * This can happen when some of the CPU grouping information
		 * is derived from buggy sources (for example, incorrect ACPI
		 * tables on x86 systems).
		 *
		 * We attempt to recover from this by pruning out the
		 * illegal groupings from the PG hierarchy, which means that
		 * we won't optimize for those levels, but we will for the
		 * remaining ones.
		 *
		 * If a given level has CPUs not found in its parent, then
		 * we examine the PG and its parent to see if either grouping
		 * is enumerated from potentially buggy sources.
		 *
		 * If one has fewer CPUs than the other, contains CPUs
		 * not found in the parent, and is an untrusted enumeration,
		 * then prune it. If both have the same number of CPUs, then
		 * prune the one that is untrusted.
		 *
		 * This process repeats until we have a concentric lineage,
		 * or until we would have to prune out a level derived from
		 * what we thought was a reliable source, in which case CMT
		 * scheduling is disabled altogether.
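		 *
		 * For example, an ACPI-derived power domain grouping whose
		 * CPUs aren't all contained by its parent grouping would be
		 * pruned here in favor of the more trusted enumeration.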
		 */
		if ((PG_NUM_CPUS((pg_t *)pg) < PG_NUM_CPUS((pg_t *)parent)) &&
		    (PG_CMT_HW_SUSPECT(((pghw_t *)pg)->pghw_hw))) {
			pg_bad = pg;
		} else if (PG_NUM_CPUS((pg_t *)pg) ==
		    PG_NUM_CPUS((pg_t *)parent)) {
			if (PG_CMT_HW_SUSPECT(((pghw_t *)parent)->pghw_hw)) {
				pg_bad = parent;
			} else if (PG_CMT_HW_SUSPECT(((pghw_t *)pg)->pghw_hw)) {
				pg_bad = pg;
			}
		}
		if (pg_bad) {
			if (pg_cmt_prune(pg_bad, lineage, sz) == 0) {
				cmt_lineage_status = CMT_LINEAGE_REPAIRED;
				goto revalidate;
			}
		}
		/*FALLTHROUGH*/
	default:
		/*
		 * If we're here, something has gone wrong in trying to
		 * recover from an illegal PG hierarchy, or we've encountered
		 * a validation error for which we don't know how to recover.
		 * In either case, disable CMT scheduling altogether.
		 */
		pg_cmt_disable();
		cmt_lineage_status = CMT_LINEAGE_UNRECOVERABLE;
		return (-1);
	}
	return (0);
}