1fb2f18f8Sesaxe /* 2fb2f18f8Sesaxe * CDDL HEADER START 3fb2f18f8Sesaxe * 4fb2f18f8Sesaxe * The contents of this file are subject to the terms of the 5fb2f18f8Sesaxe * Common Development and Distribution License (the "License"). 6fb2f18f8Sesaxe * You may not use this file except in compliance with the License. 7fb2f18f8Sesaxe * 8fb2f18f8Sesaxe * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9fb2f18f8Sesaxe * or http://www.opensolaris.org/os/licensing. 10fb2f18f8Sesaxe * See the License for the specific language governing permissions 11fb2f18f8Sesaxe * and limitations under the License. 12fb2f18f8Sesaxe * 13fb2f18f8Sesaxe * When distributing Covered Code, include this CDDL HEADER in each 14fb2f18f8Sesaxe * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15fb2f18f8Sesaxe * If applicable, add the following below this CDDL HEADER, with the 16fb2f18f8Sesaxe * fields enclosed by brackets "[]" replaced with your own identifying 17fb2f18f8Sesaxe * information: Portions Copyright [yyyy] [name of copyright owner] 18fb2f18f8Sesaxe * 19fb2f18f8Sesaxe * CDDL HEADER END 20fb2f18f8Sesaxe */ 21fb2f18f8Sesaxe /* 22d3c97224SAlexander Kolbasov * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 23fb2f18f8Sesaxe */ 24fb2f18f8Sesaxe 25fb2f18f8Sesaxe #include <sys/systm.h> 26fb2f18f8Sesaxe #include <sys/types.h> 27fb2f18f8Sesaxe #include <sys/param.h> 28fb2f18f8Sesaxe #include <sys/thread.h> 29fb2f18f8Sesaxe #include <sys/cpuvar.h> 30fb2f18f8Sesaxe #include <sys/cpupart.h> 31fb2f18f8Sesaxe #include <sys/kmem.h> 32fb2f18f8Sesaxe #include <sys/cmn_err.h> 33fb2f18f8Sesaxe #include <sys/kstat.h> 34fb2f18f8Sesaxe #include <sys/processor.h> 35fb2f18f8Sesaxe #include <sys/disp.h> 36fb2f18f8Sesaxe #include <sys/group.h> 37fb2f18f8Sesaxe #include <sys/pghw.h> 38fb2f18f8Sesaxe #include <sys/bitset.h> 39fb2f18f8Sesaxe #include <sys/lgrp.h> 40fb2f18f8Sesaxe #include <sys/cmt.h> 410e751525SEric Saxe #include <sys/cpu_pm.h> 42fb2f18f8Sesaxe 43fb2f18f8Sesaxe /* 44fb2f18f8Sesaxe * CMT scheduler / dispatcher support 45fb2f18f8Sesaxe * 46fb2f18f8Sesaxe * This file implements CMT scheduler support using Processor Groups. 47fb2f18f8Sesaxe * The CMT processor group class creates and maintains the CMT class 48fb2f18f8Sesaxe * specific processor group pg_cmt_t. 49fb2f18f8Sesaxe * 50fb2f18f8Sesaxe * ---------------------------- <-- pg_cmt_t * 51fb2f18f8Sesaxe * | pghw_t | 52fb2f18f8Sesaxe * ---------------------------- 53fb2f18f8Sesaxe * | CMT class specific data | 54fb2f18f8Sesaxe * | - hierarchy linkage | 55fb2f18f8Sesaxe * | - CMT load balancing data| 56fb2f18f8Sesaxe * | - active CPU group/bitset| 57fb2f18f8Sesaxe * ---------------------------- 58fb2f18f8Sesaxe * 59fb2f18f8Sesaxe * The scheduler/dispatcher leverages knowledge of the performance 60fb2f18f8Sesaxe * relevant CMT sharing relationships existing between cpus to implement 610e751525SEric Saxe * optimized affinity, load balancing, and coalescence policies. 62fb2f18f8Sesaxe * 63fb2f18f8Sesaxe * Load balancing policy seeks to improve performance by minimizing 640e751525SEric Saxe * contention over shared processor resources / facilities, Affinity 650e751525SEric Saxe * policies seek to improve cache and TLB utilization. Coalescence 660e751525SEric Saxe * policies improve resource utilization and ultimately power efficiency. 67fb2f18f8Sesaxe * 68fb2f18f8Sesaxe * The CMT PGs created by this class are already arranged into a 69fb2f18f8Sesaxe * hierarchy (which is done in the pghw layer). To implement the top-down 70fb2f18f8Sesaxe * CMT load balancing algorithm, the CMT PGs additionally maintain 71fb2f18f8Sesaxe * parent, child and sibling hierarchy relationships. 72fb2f18f8Sesaxe * Parent PGs always contain a superset of their children(s) resources, 73fb2f18f8Sesaxe * each PG can have at most one parent, and siblings are the group of PGs 74fb2f18f8Sesaxe * sharing the same parent. 75fb2f18f8Sesaxe * 76d0e93b69SEric Saxe * On UMA based systems, the CMT load balancing algorithm begins by balancing 77d0e93b69SEric Saxe * load across the group of top level PGs in the system hierarchy. 78d0e93b69SEric Saxe * On NUMA systems, the CMT load balancing algorithm balances load across the 79d0e93b69SEric Saxe * group of top level PGs in each leaf lgroup...but for root homed threads, 80d0e93b69SEric Saxe * is willing to balance against all the top level PGs in the system. 81d0e93b69SEric Saxe * 82d0e93b69SEric Saxe * Groups of top level PGs are maintained to implement the above, one for each 83d0e93b69SEric Saxe * leaf lgroup (containing the top level PGs in that lgroup), and one (for the 84d0e93b69SEric Saxe * root lgroup) that contains all the top level PGs in the system. 85fb2f18f8Sesaxe */ 86a6604450Sesaxe static cmt_lgrp_t *cmt_lgrps = NULL; /* cmt_lgrps list head */ 87a6604450Sesaxe static cmt_lgrp_t *cpu0_lgrp = NULL; /* boot CPU's initial lgrp */ 88a6604450Sesaxe /* used for null_proc_lpa */ 890e751525SEric Saxe cmt_lgrp_t *cmt_root = NULL; /* Reference to root cmt pg */ 90fb2f18f8Sesaxe 91a6604450Sesaxe static int is_cpu0 = 1; /* true if this is boot CPU context */ 92a6604450Sesaxe 93a6604450Sesaxe /* 940e751525SEric Saxe * Array of hardware sharing relationships that are blacklisted. 95d0e93b69SEric Saxe * CMT scheduling optimizations won't be performed for blacklisted sharing 96d0e93b69SEric Saxe * relationships. 970e751525SEric Saxe */ 980e751525SEric Saxe static int cmt_hw_blacklisted[PGHW_NUM_COMPONENTS]; 990e751525SEric Saxe 1000e751525SEric Saxe /* 101a6604450Sesaxe * Set this to non-zero to disable CMT scheduling 102a6604450Sesaxe * This must be done via kmdb -d, as /etc/system will be too late 103a6604450Sesaxe */ 1040e751525SEric Saxe int cmt_sched_disabled = 0; 105fb2f18f8Sesaxe 106ef4f35d8SEric Saxe /* 107ef4f35d8SEric Saxe * Status codes for CMT lineage validation 108ef4f35d8SEric Saxe * See pg_cmt_lineage_validate() below 109ef4f35d8SEric Saxe */ 110ef4f35d8SEric Saxe typedef enum cmt_lineage_validation { 111ef4f35d8SEric Saxe CMT_LINEAGE_VALID, 112ef4f35d8SEric Saxe CMT_LINEAGE_NON_CONCENTRIC, 113ef4f35d8SEric Saxe CMT_LINEAGE_PG_SPANS_LGRPS, 114ef4f35d8SEric Saxe CMT_LINEAGE_NON_PROMOTABLE, 115ef4f35d8SEric Saxe CMT_LINEAGE_REPAIRED, 116ef4f35d8SEric Saxe CMT_LINEAGE_UNRECOVERABLE 117ef4f35d8SEric Saxe } cmt_lineage_validation_t; 118ef4f35d8SEric Saxe 119ef4f35d8SEric Saxe /* 120ef4f35d8SEric Saxe * Status of the current lineage under construction. 121ef4f35d8SEric Saxe * One must be holding cpu_lock to change this. 122ef4f35d8SEric Saxe */ 123ef4f35d8SEric Saxe cmt_lineage_validation_t cmt_lineage_status = CMT_LINEAGE_VALID; 124ef4f35d8SEric Saxe 125ef4f35d8SEric Saxe /* 126ef4f35d8SEric Saxe * Power domain definitions (on x86) are defined by ACPI, and 127ef4f35d8SEric Saxe * therefore may be subject to BIOS bugs. 128ef4f35d8SEric Saxe */ 129ef4f35d8SEric Saxe #define PG_CMT_HW_SUSPECT(hw) PGHW_IS_PM_DOMAIN(hw) 130ef4f35d8SEric Saxe 131ef4f35d8SEric Saxe /* 132ef4f35d8SEric Saxe * Macro to test if PG is managed by the CMT PG class 133ef4f35d8SEric Saxe */ 134ef4f35d8SEric Saxe #define IS_CMT_PG(pg) (((pg_t *)(pg))->pg_class->pgc_id == pg_cmt_class_id) 135ef4f35d8SEric Saxe 136fb2f18f8Sesaxe static pg_cid_t pg_cmt_class_id; /* PG class id */ 137fb2f18f8Sesaxe 138fb2f18f8Sesaxe static pg_t *pg_cmt_alloc(); 139fb2f18f8Sesaxe static void pg_cmt_free(pg_t *); 14047ab0c7cSEric Saxe static void pg_cmt_cpu_init(cpu_t *, cpu_pg_t *); 14147ab0c7cSEric Saxe static void pg_cmt_cpu_fini(cpu_t *, cpu_pg_t *); 142fb2f18f8Sesaxe static void pg_cmt_cpu_active(cpu_t *); 143fb2f18f8Sesaxe static void pg_cmt_cpu_inactive(cpu_t *); 144fb2f18f8Sesaxe static void pg_cmt_cpupart_in(cpu_t *, cpupart_t *); 145fb2f18f8Sesaxe static void pg_cmt_cpupart_move(cpu_t *, cpupart_t *, cpupart_t *); 1460e751525SEric Saxe static char *pg_cmt_policy_name(pg_t *); 1470e751525SEric Saxe static void pg_cmt_hier_sort(pg_cmt_t **, int); 1480e751525SEric Saxe static pg_cmt_t *pg_cmt_hier_rank(pg_cmt_t *, pg_cmt_t *); 149fb2f18f8Sesaxe static int pg_cmt_cpu_belongs(pg_t *, cpu_t *); 150fb2f18f8Sesaxe static int pg_cmt_hw(pghw_type_t); 151fb2f18f8Sesaxe static cmt_lgrp_t *pg_cmt_find_lgrp(lgrp_handle_t); 152a6604450Sesaxe static cmt_lgrp_t *pg_cmt_lgrp_create(lgrp_handle_t); 1530e751525SEric Saxe static void cmt_ev_thread_swtch(pg_t *, cpu_t *, hrtime_t, 1540e751525SEric Saxe kthread_t *, kthread_t *); 1550e751525SEric Saxe static void cmt_ev_thread_swtch_pwr(pg_t *, cpu_t *, hrtime_t, 1560e751525SEric Saxe kthread_t *, kthread_t *); 1570e751525SEric Saxe static void cmt_ev_thread_remain_pwr(pg_t *, cpu_t *, kthread_t *); 1581a77c24bSEric Saxe static cmt_lineage_validation_t pg_cmt_lineage_validate(pg_cmt_t **, int *, 1591a77c24bSEric Saxe cpu_pg_t *); 160fb2f18f8Sesaxe 1610e751525SEric Saxe /* 162fb2f18f8Sesaxe * CMT PG ops 163fb2f18f8Sesaxe */ 164fb2f18f8Sesaxe struct pg_ops pg_ops_cmt = { 165fb2f18f8Sesaxe pg_cmt_alloc, 166fb2f18f8Sesaxe pg_cmt_free, 167fb2f18f8Sesaxe pg_cmt_cpu_init, 168fb2f18f8Sesaxe pg_cmt_cpu_fini, 169fb2f18f8Sesaxe pg_cmt_cpu_active, 170fb2f18f8Sesaxe pg_cmt_cpu_inactive, 171fb2f18f8Sesaxe pg_cmt_cpupart_in, 172fb2f18f8Sesaxe NULL, /* cpupart_out */ 173fb2f18f8Sesaxe pg_cmt_cpupart_move, 174fb2f18f8Sesaxe pg_cmt_cpu_belongs, 1750e751525SEric Saxe pg_cmt_policy_name, 176fb2f18f8Sesaxe }; 177fb2f18f8Sesaxe 178fb2f18f8Sesaxe /* 179fb2f18f8Sesaxe * Initialize the CMT PG class 180fb2f18f8Sesaxe */ 181fb2f18f8Sesaxe void 182fb2f18f8Sesaxe pg_cmt_class_init(void) 183fb2f18f8Sesaxe { 184fb2f18f8Sesaxe if (cmt_sched_disabled) 185fb2f18f8Sesaxe return; 186fb2f18f8Sesaxe 187fb2f18f8Sesaxe pg_cmt_class_id = pg_class_register("cmt", &pg_ops_cmt, PGR_PHYSICAL); 188fb2f18f8Sesaxe } 189fb2f18f8Sesaxe 190fb2f18f8Sesaxe /* 191fb2f18f8Sesaxe * Called to indicate a new CPU has started up so 192fb2f18f8Sesaxe * that either t0 or the slave startup thread can 193fb2f18f8Sesaxe * be accounted for. 194fb2f18f8Sesaxe */ 195fb2f18f8Sesaxe void 196fb2f18f8Sesaxe pg_cmt_cpu_startup(cpu_t *cp) 197fb2f18f8Sesaxe { 1980e751525SEric Saxe pg_ev_thread_swtch(cp, gethrtime_unscaled(), cp->cpu_idle_thread, 1990e751525SEric Saxe cp->cpu_thread); 200fb2f18f8Sesaxe } 201fb2f18f8Sesaxe 202fb2f18f8Sesaxe /* 203fb2f18f8Sesaxe * Return non-zero if thread can migrate between "from" and "to" 204fb2f18f8Sesaxe * without a performance penalty 205fb2f18f8Sesaxe */ 206fb2f18f8Sesaxe int 207fb2f18f8Sesaxe pg_cmt_can_migrate(cpu_t *from, cpu_t *to) 208fb2f18f8Sesaxe { 209fb2f18f8Sesaxe if (from->cpu_physid->cpu_cacheid == 210fb2f18f8Sesaxe to->cpu_physid->cpu_cacheid) 211fb2f18f8Sesaxe return (1); 212fb2f18f8Sesaxe return (0); 213fb2f18f8Sesaxe } 214fb2f18f8Sesaxe 215fb2f18f8Sesaxe /* 216fb2f18f8Sesaxe * CMT class specific PG allocation 217fb2f18f8Sesaxe */ 218fb2f18f8Sesaxe static pg_t * 219fb2f18f8Sesaxe pg_cmt_alloc(void) 220fb2f18f8Sesaxe { 221fb2f18f8Sesaxe return (kmem_zalloc(sizeof (pg_cmt_t), KM_NOSLEEP)); 222fb2f18f8Sesaxe } 223fb2f18f8Sesaxe 224fb2f18f8Sesaxe /* 225fb2f18f8Sesaxe * Class specific PG de-allocation 226fb2f18f8Sesaxe */ 227fb2f18f8Sesaxe static void 228fb2f18f8Sesaxe pg_cmt_free(pg_t *pg) 229fb2f18f8Sesaxe { 230fb2f18f8Sesaxe ASSERT(pg != NULL); 231fb2f18f8Sesaxe ASSERT(IS_CMT_PG(pg)); 232fb2f18f8Sesaxe 233fb2f18f8Sesaxe kmem_free((pg_cmt_t *)pg, sizeof (pg_cmt_t)); 234fb2f18f8Sesaxe } 235fb2f18f8Sesaxe 236fb2f18f8Sesaxe /* 2370e751525SEric Saxe * Given a hardware sharing relationship, return which dispatcher 2380e751525SEric Saxe * policies should be implemented to optimize performance and efficiency 239fb2f18f8Sesaxe */ 2400e751525SEric Saxe static pg_cmt_policy_t 2410e751525SEric Saxe pg_cmt_policy(pghw_type_t hw) 242fb2f18f8Sesaxe { 2430e751525SEric Saxe pg_cmt_policy_t p; 2440e751525SEric Saxe 2450e751525SEric Saxe /* 2460e751525SEric Saxe * Give the platform a chance to override the default 2470e751525SEric Saxe */ 2480e751525SEric Saxe if ((p = pg_plat_cmt_policy(hw)) != CMT_NO_POLICY) 2490e751525SEric Saxe return (p); 2500e751525SEric Saxe 2510e751525SEric Saxe switch (hw) { 2520e751525SEric Saxe case PGHW_IPIPE: 2530e751525SEric Saxe case PGHW_FPU: 2548031591dSSrihari Venkatesan case PGHW_PROCNODE: 2550e751525SEric Saxe case PGHW_CHIP: 2560e751525SEric Saxe return (CMT_BALANCE); 2570e751525SEric Saxe case PGHW_CACHE: 258d3c97224SAlexander Kolbasov return (CMT_AFFINITY | CMT_BALANCE); 2590e751525SEric Saxe case PGHW_POW_ACTIVE: 2600e751525SEric Saxe case PGHW_POW_IDLE: 2610e751525SEric Saxe return (CMT_BALANCE); 2620e751525SEric Saxe default: 2630e751525SEric Saxe return (CMT_NO_POLICY); 2640e751525SEric Saxe } 2650e751525SEric Saxe } 2660e751525SEric Saxe 2670e751525SEric Saxe /* 2680e751525SEric Saxe * Rank the importance of optimizing for the pg1 relationship vs. 2690e751525SEric Saxe * the pg2 relationship. 2700e751525SEric Saxe */ 2710e751525SEric Saxe static pg_cmt_t * 2720e751525SEric Saxe pg_cmt_hier_rank(pg_cmt_t *pg1, pg_cmt_t *pg2) 2730e751525SEric Saxe { 2740e751525SEric Saxe pghw_type_t hw1 = ((pghw_t *)pg1)->pghw_hw; 2750e751525SEric Saxe pghw_type_t hw2 = ((pghw_t *)pg2)->pghw_hw; 2760e751525SEric Saxe 2770e751525SEric Saxe /* 2780e751525SEric Saxe * A power domain is only important if CPUPM is enabled. 2790e751525SEric Saxe */ 2800e751525SEric Saxe if (cpupm_get_policy() == CPUPM_POLICY_DISABLED) { 2810e751525SEric Saxe if (PGHW_IS_PM_DOMAIN(hw1) && !PGHW_IS_PM_DOMAIN(hw2)) 2820e751525SEric Saxe return (pg2); 2830e751525SEric Saxe if (PGHW_IS_PM_DOMAIN(hw2) && !PGHW_IS_PM_DOMAIN(hw1)) 2840e751525SEric Saxe return (pg1); 2850e751525SEric Saxe } 2860e751525SEric Saxe 2870e751525SEric Saxe /* 2880e751525SEric Saxe * Otherwise, ask the platform 2890e751525SEric Saxe */ 2900e751525SEric Saxe if (pg_plat_hw_rank(hw1, hw2) == hw1) 2910e751525SEric Saxe return (pg1); 2920e751525SEric Saxe else 2930e751525SEric Saxe return (pg2); 2940e751525SEric Saxe } 2950e751525SEric Saxe 2960e751525SEric Saxe /* 2970e751525SEric Saxe * Initialize CMT callbacks for the given PG 2980e751525SEric Saxe */ 2990e751525SEric Saxe static void 3000e751525SEric Saxe cmt_callback_init(pg_t *pg) 3010e751525SEric Saxe { 302d0e93b69SEric Saxe /* 303d0e93b69SEric Saxe * Stick with the default callbacks if there isn't going to be 304d0e93b69SEric Saxe * any CMT thread placement optimizations implemented. 305d0e93b69SEric Saxe */ 306d0e93b69SEric Saxe if (((pg_cmt_t *)pg)->cmt_policy == CMT_NO_POLICY) 307d0e93b69SEric Saxe return; 308d0e93b69SEric Saxe 3090e751525SEric Saxe switch (((pghw_t *)pg)->pghw_hw) { 3100e751525SEric Saxe case PGHW_POW_ACTIVE: 3110e751525SEric Saxe pg->pg_cb.thread_swtch = cmt_ev_thread_swtch_pwr; 3120e751525SEric Saxe pg->pg_cb.thread_remain = cmt_ev_thread_remain_pwr; 3130e751525SEric Saxe break; 3140e751525SEric Saxe default: 3150e751525SEric Saxe pg->pg_cb.thread_swtch = cmt_ev_thread_swtch; 3160e751525SEric Saxe 3170e751525SEric Saxe } 3180e751525SEric Saxe } 3190e751525SEric Saxe 3200e751525SEric Saxe /* 3210e751525SEric Saxe * Promote PG above it's current parent. 3221a77c24bSEric Saxe * This is only legal if PG has an equal or greater number of CPUs than its 3231a77c24bSEric Saxe * parent. 3241a77c24bSEric Saxe * 3251a77c24bSEric Saxe * This routine operates on the CPU specific processor group data (for the CPUs 3261a77c24bSEric Saxe * in the PG being promoted), and may be invoked from a context where one CPU's 3271a77c24bSEric Saxe * PG data is under construction. In this case the argument "pgdata", if not 3281a77c24bSEric Saxe * NULL, is a reference to the CPU's under-construction PG data. 3290e751525SEric Saxe */ 3300e751525SEric Saxe static void 3311a77c24bSEric Saxe cmt_hier_promote(pg_cmt_t *pg, cpu_pg_t *pgdata) 3320e751525SEric Saxe { 3330e751525SEric Saxe pg_cmt_t *parent; 3340e751525SEric Saxe group_t *children; 3350e751525SEric Saxe cpu_t *cpu; 3360e751525SEric Saxe group_iter_t iter; 3370e751525SEric Saxe pg_cpu_itr_t cpu_iter; 3380e751525SEric Saxe int r; 3390e751525SEric Saxe int err; 340b025faeeSEric Saxe int nchildren; 3410e751525SEric Saxe 3420e751525SEric Saxe ASSERT(MUTEX_HELD(&cpu_lock)); 3430e751525SEric Saxe 3440e751525SEric Saxe parent = pg->cmt_parent; 3450e751525SEric Saxe if (parent == NULL) { 3460e751525SEric Saxe /* 3470e751525SEric Saxe * Nothing to do 3480e751525SEric Saxe */ 3490e751525SEric Saxe return; 3500e751525SEric Saxe } 3510e751525SEric Saxe 3520e751525SEric Saxe ASSERT(PG_NUM_CPUS((pg_t *)pg) >= PG_NUM_CPUS((pg_t *)parent)); 3530e751525SEric Saxe 3540e751525SEric Saxe /* 3550e751525SEric Saxe * We're changing around the hierarchy, which is actively traversed 3560e751525SEric Saxe * by the dispatcher. Pause CPUS to ensure exclusivity. 3570e751525SEric Saxe */ 358*0ed5c46eSJosef 'Jeff' Sipek pause_cpus(NULL, NULL); 3590e751525SEric Saxe 3600e751525SEric Saxe /* 3610e751525SEric Saxe * If necessary, update the parent's sibling set, replacing parent 3620e751525SEric Saxe * with PG. 3630e751525SEric Saxe */ 3640e751525SEric Saxe if (parent->cmt_siblings) { 3650e751525SEric Saxe if (group_remove(parent->cmt_siblings, parent, GRP_NORESIZE) 3660e751525SEric Saxe != -1) { 3670e751525SEric Saxe r = group_add(parent->cmt_siblings, pg, GRP_NORESIZE); 3680e751525SEric Saxe ASSERT(r != -1); 3690e751525SEric Saxe } 3700e751525SEric Saxe } 3710e751525SEric Saxe 3720e751525SEric Saxe /* 3730e751525SEric Saxe * If the parent is at the top of the hierarchy, replace it's entry 3740e751525SEric Saxe * in the root lgroup's group of top level PGs. 3750e751525SEric Saxe */ 3760e751525SEric Saxe if (parent->cmt_parent == NULL && 3770e751525SEric Saxe parent->cmt_siblings != &cmt_root->cl_pgs) { 3780e751525SEric Saxe if (group_remove(&cmt_root->cl_pgs, parent, GRP_NORESIZE) 3790e751525SEric Saxe != -1) { 3800e751525SEric Saxe r = group_add(&cmt_root->cl_pgs, pg, GRP_NORESIZE); 3810e751525SEric Saxe ASSERT(r != -1); 3820e751525SEric Saxe } 3830e751525SEric Saxe } 3840e751525SEric Saxe 3850e751525SEric Saxe /* 3860e751525SEric Saxe * We assume (and therefore assert) that the PG being promoted is an 3870e751525SEric Saxe * only child of it's parent. Update the parent's children set 3880e751525SEric Saxe * replacing PG's entry with the parent (since the parent is becoming 389b025faeeSEric Saxe * the child). Then have PG and the parent swap children sets and 390b025faeeSEric Saxe * children counts. 3910e751525SEric Saxe */ 3920e751525SEric Saxe ASSERT(GROUP_SIZE(parent->cmt_children) <= 1); 3930e751525SEric Saxe if (group_remove(parent->cmt_children, pg, GRP_NORESIZE) != -1) { 3940e751525SEric Saxe r = group_add(parent->cmt_children, parent, GRP_NORESIZE); 3950e751525SEric Saxe ASSERT(r != -1); 3960e751525SEric Saxe } 3970e751525SEric Saxe 3980e751525SEric Saxe children = pg->cmt_children; 3990e751525SEric Saxe pg->cmt_children = parent->cmt_children; 4000e751525SEric Saxe parent->cmt_children = children; 4010e751525SEric Saxe 402b025faeeSEric Saxe nchildren = pg->cmt_nchildren; 403b025faeeSEric Saxe pg->cmt_nchildren = parent->cmt_nchildren; 404b025faeeSEric Saxe parent->cmt_nchildren = nchildren; 405b025faeeSEric Saxe 4060e751525SEric Saxe /* 4070e751525SEric Saxe * Update the sibling references for PG and it's parent 4080e751525SEric Saxe */ 4090e751525SEric Saxe pg->cmt_siblings = parent->cmt_siblings; 4100e751525SEric Saxe parent->cmt_siblings = pg->cmt_children; 4110e751525SEric Saxe 4120e751525SEric Saxe /* 4130e751525SEric Saxe * Update any cached lineages in the per CPU pg data. 4140e751525SEric Saxe */ 4150e751525SEric Saxe PG_CPU_ITR_INIT(pg, cpu_iter); 4160e751525SEric Saxe while ((cpu = pg_cpu_next(&cpu_iter)) != NULL) { 4170e751525SEric Saxe int idx; 418b025faeeSEric Saxe int sz; 4190e751525SEric Saxe pg_cmt_t *cpu_pg; 4201a77c24bSEric Saxe cpu_pg_t *pgd; /* CPU's PG data */ 4211a77c24bSEric Saxe 4221a77c24bSEric Saxe /* 4231a77c24bSEric Saxe * The CPU's whose lineage is under construction still 4241a77c24bSEric Saxe * references the bootstrap CPU PG data structure. 4251a77c24bSEric Saxe */ 4261a77c24bSEric Saxe if (pg_cpu_is_bootstrapped(cpu)) 4271a77c24bSEric Saxe pgd = pgdata; 4281a77c24bSEric Saxe else 4291a77c24bSEric Saxe pgd = cpu->cpu_pg; 4300e751525SEric Saxe 4310e751525SEric Saxe /* 4320e751525SEric Saxe * Iterate over the CPU's PGs updating the children 4330e751525SEric Saxe * of the PG being promoted, since they have a new parent. 4340e751525SEric Saxe */ 4350e751525SEric Saxe group_iter_init(&iter); 4361a77c24bSEric Saxe while ((cpu_pg = group_iterate(&pgd->cmt_pgs, &iter)) != NULL) { 4370e751525SEric Saxe if (cpu_pg->cmt_parent == pg) { 4380e751525SEric Saxe cpu_pg->cmt_parent = parent; 4390e751525SEric Saxe } 4400e751525SEric Saxe } 4410e751525SEric Saxe 4420e751525SEric Saxe /* 4430e751525SEric Saxe * Update the CMT load balancing lineage 4440e751525SEric Saxe */ 4451a77c24bSEric Saxe if ((idx = group_find(&pgd->cmt_pgs, (void *)pg)) == -1) { 4460e751525SEric Saxe /* 4470e751525SEric Saxe * Unless this is the CPU who's lineage is being 4480e751525SEric Saxe * constructed, the PG being promoted should be 4490e751525SEric Saxe * in the lineage. 4500e751525SEric Saxe */ 4511a77c24bSEric Saxe ASSERT(pg_cpu_is_bootstrapped(cpu)); 4520e751525SEric Saxe continue; 4530e751525SEric Saxe } 4540e751525SEric Saxe 4550e751525SEric Saxe ASSERT(idx > 0); 456b025faeeSEric Saxe ASSERT(GROUP_ACCESS(&pgd->cmt_pgs, idx - 1) == parent); 4570e751525SEric Saxe 4580e751525SEric Saxe /* 4590e751525SEric Saxe * Have the child and the parent swap places in the CPU's 4600e751525SEric Saxe * lineage 4610e751525SEric Saxe */ 4621a77c24bSEric Saxe group_remove_at(&pgd->cmt_pgs, idx); 4631a77c24bSEric Saxe group_remove_at(&pgd->cmt_pgs, idx - 1); 4641a77c24bSEric Saxe err = group_add_at(&pgd->cmt_pgs, parent, idx); 4650e751525SEric Saxe ASSERT(err == 0); 4661a77c24bSEric Saxe err = group_add_at(&pgd->cmt_pgs, pg, idx - 1); 4670e751525SEric Saxe ASSERT(err == 0); 468b025faeeSEric Saxe 469b025faeeSEric Saxe /* 470b025faeeSEric Saxe * Ensure cmt_lineage references CPU's leaf PG. 471b025faeeSEric Saxe * Since cmt_pgs is top-down ordered, the bottom is the last 472b025faeeSEric Saxe * element. 473b025faeeSEric Saxe */ 474b025faeeSEric Saxe if ((sz = GROUP_SIZE(&pgd->cmt_pgs)) > 0) 475b025faeeSEric Saxe pgd->cmt_lineage = GROUP_ACCESS(&pgd->cmt_pgs, sz - 1); 4760e751525SEric Saxe } 4770e751525SEric Saxe 4780e751525SEric Saxe /* 4790e751525SEric Saxe * Update the parent references for PG and it's parent 4800e751525SEric Saxe */ 4810e751525SEric Saxe pg->cmt_parent = parent->cmt_parent; 4820e751525SEric Saxe parent->cmt_parent = pg; 4830e751525SEric Saxe 4840e751525SEric Saxe start_cpus(); 485fb2f18f8Sesaxe } 486fb2f18f8Sesaxe 487fb2f18f8Sesaxe /* 488fb2f18f8Sesaxe * CMT class callback for a new CPU entering the system 4891a77c24bSEric Saxe * 4901a77c24bSEric Saxe * This routine operates on the CPU specific processor group data (for the CPU 4911a77c24bSEric Saxe * being initialized). The argument "pgdata" is a reference to the CPU's PG 4921a77c24bSEric Saxe * data to be constructed. 4931a77c24bSEric Saxe * 4941a77c24bSEric Saxe * cp->cpu_pg is used by the dispatcher to access the CPU's PG data 4951a77c24bSEric Saxe * references a "bootstrap" structure. pg_cmt_cpu_init() and the routines it 4961a77c24bSEric Saxe * calls must be careful to operate only on the "pgdata" argument, and not 4971a77c24bSEric Saxe * cp->cpu_pg. 498fb2f18f8Sesaxe */ 499fb2f18f8Sesaxe static void 5001a77c24bSEric Saxe pg_cmt_cpu_init(cpu_t *cp, cpu_pg_t *pgdata) 501fb2f18f8Sesaxe { 502fb2f18f8Sesaxe pg_cmt_t *pg; 503fb2f18f8Sesaxe group_t *cmt_pgs; 5040e751525SEric Saxe int levels, level; 505fb2f18f8Sesaxe pghw_type_t hw; 506fb2f18f8Sesaxe pg_t *pg_cache = NULL; 507fb2f18f8Sesaxe pg_cmt_t *cpu_cmt_hier[PGHW_NUM_COMPONENTS]; 508fb2f18f8Sesaxe lgrp_handle_t lgrp_handle; 509fb2f18f8Sesaxe cmt_lgrp_t *lgrp; 510ef4f35d8SEric Saxe cmt_lineage_validation_t lineage_status; 511fb2f18f8Sesaxe 512fb2f18f8Sesaxe ASSERT(MUTEX_HELD(&cpu_lock)); 5131a77c24bSEric Saxe ASSERT(pg_cpu_is_bootstrapped(cp)); 514fb2f18f8Sesaxe 5150e751525SEric Saxe if (cmt_sched_disabled) 5160e751525SEric Saxe return; 5170e751525SEric Saxe 518fb2f18f8Sesaxe /* 519fb2f18f8Sesaxe * A new CPU is coming into the system. 520fb2f18f8Sesaxe * Interrogate the platform to see if the CPU 5210e751525SEric Saxe * has any performance or efficiency relevant 5220e751525SEric Saxe * sharing relationships 523fb2f18f8Sesaxe */ 5241a77c24bSEric Saxe cmt_pgs = &pgdata->cmt_pgs; 5251a77c24bSEric Saxe pgdata->cmt_lineage = NULL; 526fb2f18f8Sesaxe 527fb2f18f8Sesaxe bzero(cpu_cmt_hier, sizeof (cpu_cmt_hier)); 5280e751525SEric Saxe levels = 0; 529fb2f18f8Sesaxe for (hw = PGHW_START; hw < PGHW_NUM_COMPONENTS; hw++) { 530fb2f18f8Sesaxe 5310e751525SEric Saxe pg_cmt_policy_t policy; 5320e751525SEric Saxe 533fb2f18f8Sesaxe /* 5340e751525SEric Saxe * We're only interested in the hw sharing relationships 5350e751525SEric Saxe * for which we know how to optimize. 536fb2f18f8Sesaxe */ 5370e751525SEric Saxe policy = pg_cmt_policy(hw); 5380e751525SEric Saxe if (policy == CMT_NO_POLICY || 5390e751525SEric Saxe pg_plat_hw_shared(cp, hw) == 0) 540fb2f18f8Sesaxe continue; 541fb2f18f8Sesaxe 542fb2f18f8Sesaxe /* 543d0e93b69SEric Saxe * We will still create the PGs for hardware sharing 544d0e93b69SEric Saxe * relationships that have been blacklisted, but won't 545d0e93b69SEric Saxe * implement CMT thread placement optimizations against them. 5460e751525SEric Saxe */ 547d0e93b69SEric Saxe if (cmt_hw_blacklisted[hw] == 1) 548d0e93b69SEric Saxe policy = CMT_NO_POLICY; 5490e751525SEric Saxe 5500e751525SEric Saxe /* 551fb2f18f8Sesaxe * Find (or create) the PG associated with 552fb2f18f8Sesaxe * the hw sharing relationship in which cp 553fb2f18f8Sesaxe * belongs. 554fb2f18f8Sesaxe * 555fb2f18f8Sesaxe * Determine if a suitable PG already 556fb2f18f8Sesaxe * exists, or if one needs to be created. 557fb2f18f8Sesaxe */ 558fb2f18f8Sesaxe pg = (pg_cmt_t *)pghw_place_cpu(cp, hw); 559fb2f18f8Sesaxe if (pg == NULL) { 560fb2f18f8Sesaxe /* 561fb2f18f8Sesaxe * Create a new one. 562fb2f18f8Sesaxe * Initialize the common... 563fb2f18f8Sesaxe */ 564fb2f18f8Sesaxe pg = (pg_cmt_t *)pg_create(pg_cmt_class_id); 565fb2f18f8Sesaxe 566fb2f18f8Sesaxe /* ... physical ... */ 567fb2f18f8Sesaxe pghw_init((pghw_t *)pg, cp, hw); 568fb2f18f8Sesaxe 569fb2f18f8Sesaxe /* 570fb2f18f8Sesaxe * ... and CMT specific portions of the 571fb2f18f8Sesaxe * structure. 572fb2f18f8Sesaxe */ 5730e751525SEric Saxe pg->cmt_policy = policy; 5740e751525SEric Saxe 5750e751525SEric Saxe /* CMT event callbacks */ 5760e751525SEric Saxe cmt_callback_init((pg_t *)pg); 5770e751525SEric Saxe 578fb2f18f8Sesaxe bitset_init(&pg->cmt_cpus_actv_set); 579fb2f18f8Sesaxe group_create(&pg->cmt_cpus_actv); 580fb2f18f8Sesaxe } else { 581fb2f18f8Sesaxe ASSERT(IS_CMT_PG(pg)); 582fb2f18f8Sesaxe } 583fb2f18f8Sesaxe 584b885580bSAlexander Kolbasov ((pghw_t *)pg)->pghw_generation++; 585b885580bSAlexander Kolbasov 586fb2f18f8Sesaxe /* Add the CPU to the PG */ 5871a77c24bSEric Saxe pg_cpu_add((pg_t *)pg, cp, pgdata); 588fb2f18f8Sesaxe 589fb2f18f8Sesaxe /* 5906890d023SEric Saxe * Ensure capacity of the active CPU group/bitset 591fb2f18f8Sesaxe */ 592fb2f18f8Sesaxe group_expand(&pg->cmt_cpus_actv, 593fb2f18f8Sesaxe GROUP_SIZE(&((pg_t *)pg)->pg_cpus)); 594fb2f18f8Sesaxe 595fb2f18f8Sesaxe if (cp->cpu_seqid >= 596fb2f18f8Sesaxe bitset_capacity(&pg->cmt_cpus_actv_set)) { 597fb2f18f8Sesaxe bitset_resize(&pg->cmt_cpus_actv_set, 598fb2f18f8Sesaxe cp->cpu_seqid + 1); 599fb2f18f8Sesaxe } 600fb2f18f8Sesaxe 601fb2f18f8Sesaxe /* 6020e751525SEric Saxe * Build a lineage of CMT PGs for load balancing / coalescence 603fb2f18f8Sesaxe */ 6040e751525SEric Saxe if (policy & (CMT_BALANCE | CMT_COALESCE)) { 6050e751525SEric Saxe cpu_cmt_hier[levels++] = pg; 606fb2f18f8Sesaxe } 607fb2f18f8Sesaxe 608fb2f18f8Sesaxe /* Cache this for later */ 609fb2f18f8Sesaxe if (hw == PGHW_CACHE) 610fb2f18f8Sesaxe pg_cache = (pg_t *)pg; 611fb2f18f8Sesaxe } 612fb2f18f8Sesaxe 6130e751525SEric Saxe group_expand(cmt_pgs, levels); 6146890d023SEric Saxe 6156890d023SEric Saxe if (cmt_root == NULL) 6166890d023SEric Saxe cmt_root = pg_cmt_lgrp_create(lgrp_plat_root_hand()); 617fb2f18f8Sesaxe 618fb2f18f8Sesaxe /* 6190e751525SEric Saxe * Find the lgrp that encapsulates this CPU's CMT hierarchy 6206890d023SEric Saxe */ 6216890d023SEric Saxe lgrp_handle = lgrp_plat_cpu_to_hand(cp->cpu_id); 6226890d023SEric Saxe if ((lgrp = pg_cmt_find_lgrp(lgrp_handle)) == NULL) 6236890d023SEric Saxe lgrp = pg_cmt_lgrp_create(lgrp_handle); 6246890d023SEric Saxe 6256890d023SEric Saxe /* 6260e751525SEric Saxe * Ascendingly sort the PGs in the lineage by number of CPUs 6270e751525SEric Saxe */ 6280e751525SEric Saxe pg_cmt_hier_sort(cpu_cmt_hier, levels); 6290e751525SEric Saxe 6300e751525SEric Saxe /* 6310e751525SEric Saxe * Examine the lineage and validate it. 6320e751525SEric Saxe * This routine will also try to fix the lineage along with the 6330e751525SEric Saxe * rest of the PG hierarchy should it detect an issue. 6340e751525SEric Saxe * 635ef4f35d8SEric Saxe * If it returns anything other than VALID or REPAIRED, an 636ef4f35d8SEric Saxe * unrecoverable error has occurred, and we cannot proceed. 6370e751525SEric Saxe */ 6381a77c24bSEric Saxe lineage_status = pg_cmt_lineage_validate(cpu_cmt_hier, &levels, pgdata); 639ef4f35d8SEric Saxe if ((lineage_status != CMT_LINEAGE_VALID) && 6401a77c24bSEric Saxe (lineage_status != CMT_LINEAGE_REPAIRED)) { 6411a77c24bSEric Saxe /* 6421a77c24bSEric Saxe * In the case of an unrecoverable error where CMT scheduling 6431a77c24bSEric Saxe * has been disabled, assert that the under construction CPU's 6441a77c24bSEric Saxe * PG data has an empty CMT load balancing lineage. 6451a77c24bSEric Saxe */ 6461a77c24bSEric Saxe ASSERT((cmt_sched_disabled == 0) || 6471a77c24bSEric Saxe (GROUP_SIZE(&(pgdata->cmt_pgs)) == 0)); 6480e751525SEric Saxe return; 6491a77c24bSEric Saxe } 6500e751525SEric Saxe 6510e751525SEric Saxe /* 6520e751525SEric Saxe * For existing PGs in the lineage, verify that the parent is 6530e751525SEric Saxe * correct, as the generation in the lineage may have changed 6540e751525SEric Saxe * as a result of the sorting. Start the traversal at the top 6550e751525SEric Saxe * of the lineage, moving down. 6560e751525SEric Saxe */ 6570e751525SEric Saxe for (level = levels - 1; level >= 0; ) { 6580e751525SEric Saxe int reorg; 6590e751525SEric Saxe 6600e751525SEric Saxe reorg = 0; 6610e751525SEric Saxe pg = cpu_cmt_hier[level]; 6620e751525SEric Saxe 6630e751525SEric Saxe /* 6640e751525SEric Saxe * Promote PGs at an incorrect generation into place. 6650e751525SEric Saxe */ 6660e751525SEric Saxe while (pg->cmt_parent && 6670e751525SEric Saxe pg->cmt_parent != cpu_cmt_hier[level + 1]) { 6681a77c24bSEric Saxe cmt_hier_promote(pg, pgdata); 6690e751525SEric Saxe reorg++; 6700e751525SEric Saxe } 6710e751525SEric Saxe if (reorg > 0) 6720e751525SEric Saxe level = levels - 1; 6730e751525SEric Saxe else 6740e751525SEric Saxe level--; 6750e751525SEric Saxe } 6760e751525SEric Saxe 6770e751525SEric Saxe /* 6786890d023SEric Saxe * For each of the PGs in the CPU's lineage: 6790e751525SEric Saxe * - Add an entry in the CPU sorted CMT PG group 6800e751525SEric Saxe * which is used for top down CMT load balancing 681fb2f18f8Sesaxe * - Tie the PG into the CMT hierarchy by connecting 682fb2f18f8Sesaxe * it to it's parent and siblings. 683fb2f18f8Sesaxe */ 6840e751525SEric Saxe for (level = 0; level < levels; level++) { 685fb2f18f8Sesaxe uint_t children; 686fb2f18f8Sesaxe int err; 687fb2f18f8Sesaxe 688fb2f18f8Sesaxe pg = cpu_cmt_hier[level]; 6890e751525SEric Saxe err = group_add_at(cmt_pgs, pg, levels - level - 1); 690fb2f18f8Sesaxe ASSERT(err == 0); 691fb2f18f8Sesaxe 692fb2f18f8Sesaxe if (level == 0) 6931a77c24bSEric Saxe pgdata->cmt_lineage = (pg_t *)pg; 694fb2f18f8Sesaxe 695fb2f18f8Sesaxe if (pg->cmt_siblings != NULL) { 696fb2f18f8Sesaxe /* Already initialized */ 697fb2f18f8Sesaxe ASSERT(pg->cmt_parent == NULL || 698fb2f18f8Sesaxe pg->cmt_parent == cpu_cmt_hier[level + 1]); 699fb2f18f8Sesaxe ASSERT(pg->cmt_siblings == &lgrp->cl_pgs || 700c416da2dSjb145095 ((pg->cmt_parent != NULL) && 701c416da2dSjb145095 pg->cmt_siblings == pg->cmt_parent->cmt_children)); 702fb2f18f8Sesaxe continue; 703fb2f18f8Sesaxe } 704fb2f18f8Sesaxe 7050e751525SEric Saxe if ((level + 1) == levels) { 706fb2f18f8Sesaxe pg->cmt_parent = NULL; 7076890d023SEric Saxe 708fb2f18f8Sesaxe pg->cmt_siblings = &lgrp->cl_pgs; 709fb2f18f8Sesaxe children = ++lgrp->cl_npgs; 7100e751525SEric Saxe if (cmt_root != lgrp) 7116890d023SEric Saxe cmt_root->cl_npgs++; 712fb2f18f8Sesaxe } else { 713fb2f18f8Sesaxe pg->cmt_parent = cpu_cmt_hier[level + 1]; 714fb2f18f8Sesaxe 715fb2f18f8Sesaxe /* 716fb2f18f8Sesaxe * A good parent keeps track of their children. 717fb2f18f8Sesaxe * The parent's children group is also the PG's 718fb2f18f8Sesaxe * siblings. 719fb2f18f8Sesaxe */ 720fb2f18f8Sesaxe if (pg->cmt_parent->cmt_children == NULL) { 721fb2f18f8Sesaxe pg->cmt_parent->cmt_children = 722fb2f18f8Sesaxe kmem_zalloc(sizeof (group_t), KM_SLEEP); 723fb2f18f8Sesaxe group_create(pg->cmt_parent->cmt_children); 724fb2f18f8Sesaxe } 725fb2f18f8Sesaxe pg->cmt_siblings = pg->cmt_parent->cmt_children; 726fb2f18f8Sesaxe children = ++pg->cmt_parent->cmt_nchildren; 727fb2f18f8Sesaxe } 7286890d023SEric Saxe 729fb2f18f8Sesaxe group_expand(pg->cmt_siblings, children); 7306890d023SEric Saxe group_expand(&cmt_root->cl_pgs, cmt_root->cl_npgs); 731fb2f18f8Sesaxe } 732fb2f18f8Sesaxe 733fb2f18f8Sesaxe /* 734fb2f18f8Sesaxe * Cache the chip and core IDs in the cpu_t->cpu_physid structure 735fb2f18f8Sesaxe * for fast lookups later. 736fb2f18f8Sesaxe */ 737fb2f18f8Sesaxe if (cp->cpu_physid) { 738fb2f18f8Sesaxe cp->cpu_physid->cpu_chipid = 739fb2f18f8Sesaxe pg_plat_hw_instance_id(cp, PGHW_CHIP); 740fb2f18f8Sesaxe cp->cpu_physid->cpu_coreid = pg_plat_get_core_id(cp); 741fb2f18f8Sesaxe 742fb2f18f8Sesaxe /* 743fb2f18f8Sesaxe * If this cpu has a PG representing shared cache, then set 744fb2f18f8Sesaxe * cpu_cacheid to that PG's logical id 745fb2f18f8Sesaxe */ 746fb2f18f8Sesaxe if (pg_cache) 747fb2f18f8Sesaxe cp->cpu_physid->cpu_cacheid = pg_cache->pg_id; 748fb2f18f8Sesaxe } 749fb2f18f8Sesaxe 750fb2f18f8Sesaxe /* CPU0 only initialization */ 751fb2f18f8Sesaxe if (is_cpu0) { 752fb2f18f8Sesaxe is_cpu0 = 0; 753a6604450Sesaxe cpu0_lgrp = lgrp; 754fb2f18f8Sesaxe } 755fb2f18f8Sesaxe 756fb2f18f8Sesaxe } 757fb2f18f8Sesaxe 758fb2f18f8Sesaxe /* 759fb2f18f8Sesaxe * Class callback when a CPU is leaving the system (deletion) 7601a77c24bSEric Saxe * 7611a77c24bSEric Saxe * "pgdata" is a reference to the CPU's PG data to be deconstructed. 7621a77c24bSEric Saxe * 7631a77c24bSEric Saxe * cp->cpu_pg is used by the dispatcher to access the CPU's PG data 7641a77c24bSEric Saxe * references a "bootstrap" structure across this function's invocation. 765b885580bSAlexander Kolbasov * pg_cmt_cpu_fini() and the routines it calls must be careful to operate only 7661a77c24bSEric Saxe * on the "pgdata" argument, and not cp->cpu_pg. 767fb2f18f8Sesaxe */ 768fb2f18f8Sesaxe static void 7691a77c24bSEric Saxe pg_cmt_cpu_fini(cpu_t *cp, cpu_pg_t *pgdata) 770fb2f18f8Sesaxe { 771fb2f18f8Sesaxe group_iter_t i; 772fb2f18f8Sesaxe pg_cmt_t *pg; 773fb2f18f8Sesaxe group_t *pgs, *cmt_pgs; 774fb2f18f8Sesaxe lgrp_handle_t lgrp_handle; 775fb2f18f8Sesaxe cmt_lgrp_t *lgrp; 776fb2f18f8Sesaxe 7770e751525SEric Saxe if (cmt_sched_disabled) 7780e751525SEric Saxe return; 7790e751525SEric Saxe 7801a77c24bSEric Saxe ASSERT(pg_cpu_is_bootstrapped(cp)); 7811a77c24bSEric Saxe 7821a77c24bSEric Saxe pgs = &pgdata->pgs; 7831a77c24bSEric Saxe cmt_pgs = &pgdata->cmt_pgs; 784fb2f18f8Sesaxe 785fb2f18f8Sesaxe /* 786fb2f18f8Sesaxe * Find the lgroup that encapsulates this CPU's CMT hierarchy 787fb2f18f8Sesaxe */ 788fb2f18f8Sesaxe lgrp_handle = lgrp_plat_cpu_to_hand(cp->cpu_id); 789a6604450Sesaxe 790fb2f18f8Sesaxe lgrp = pg_cmt_find_lgrp(lgrp_handle); 7913e81cacfSEric Saxe if (ncpus == 1 && lgrp != cpu0_lgrp) { 792a6604450Sesaxe /* 7933e81cacfSEric Saxe * One might wonder how we could be deconfiguring the 7943e81cacfSEric Saxe * only CPU in the system. 795a6604450Sesaxe * 7963e81cacfSEric Saxe * On Starcat systems when null_proc_lpa is detected, 7973e81cacfSEric Saxe * the boot CPU (which is already configured into a leaf 7983e81cacfSEric Saxe * lgroup), is moved into the root lgroup. This is done by 7993e81cacfSEric Saxe * deconfiguring it from both lgroups and processor 8003e81cacfSEric Saxe * groups), and then later reconfiguring it back in. This 8013e81cacfSEric Saxe * call to pg_cmt_cpu_fini() is part of that deconfiguration. 8023e81cacfSEric Saxe * 8033e81cacfSEric Saxe * This special case is detected by noting that the platform 8043e81cacfSEric Saxe * has changed the CPU's lgrp affiliation (since it now 8053e81cacfSEric Saxe * belongs in the root). In this case, use the cmt_lgrp_t 8063e81cacfSEric Saxe * cached for the boot CPU, since this is what needs to be 8073e81cacfSEric Saxe * torn down. 808a6604450Sesaxe */ 809a6604450Sesaxe lgrp = cpu0_lgrp; 810a6604450Sesaxe } 811fb2f18f8Sesaxe 8123e81cacfSEric Saxe ASSERT(lgrp != NULL); 8133e81cacfSEric Saxe 814fb2f18f8Sesaxe /* 815fb2f18f8Sesaxe * First, clean up anything load balancing specific for each of 816fb2f18f8Sesaxe * the CPU's PGs that participated in CMT load balancing 817fb2f18f8Sesaxe */ 8181a77c24bSEric Saxe pg = (pg_cmt_t *)pgdata->cmt_lineage; 819fb2f18f8Sesaxe while (pg != NULL) { 820fb2f18f8Sesaxe 821b885580bSAlexander Kolbasov ((pghw_t *)pg)->pghw_generation++; 822b885580bSAlexander Kolbasov 823fb2f18f8Sesaxe /* 824fb2f18f8Sesaxe * Remove the PG from the CPU's load balancing lineage 825fb2f18f8Sesaxe */ 826fb2f18f8Sesaxe (void) group_remove(cmt_pgs, pg, GRP_RESIZE); 827fb2f18f8Sesaxe 828fb2f18f8Sesaxe /* 829fb2f18f8Sesaxe * If it's about to become empty, destroy it's children 830fb2f18f8Sesaxe * group, and remove it's reference from it's siblings. 831fb2f18f8Sesaxe * This is done here (rather than below) to avoid removing 832fb2f18f8Sesaxe * our reference from a PG that we just eliminated. 833fb2f18f8Sesaxe */ 834fb2f18f8Sesaxe if (GROUP_SIZE(&((pg_t *)pg)->pg_cpus) == 1) { 835fb2f18f8Sesaxe if (pg->cmt_children != NULL) 836fb2f18f8Sesaxe group_destroy(pg->cmt_children); 837fb2f18f8Sesaxe if (pg->cmt_siblings != NULL) { 838fb2f18f8Sesaxe if (pg->cmt_siblings == &lgrp->cl_pgs) 839fb2f18f8Sesaxe lgrp->cl_npgs--; 840fb2f18f8Sesaxe else 841fb2f18f8Sesaxe pg->cmt_parent->cmt_nchildren--; 842fb2f18f8Sesaxe } 843fb2f18f8Sesaxe } 844fb2f18f8Sesaxe pg = pg->cmt_parent; 845fb2f18f8Sesaxe } 846fb2f18f8Sesaxe ASSERT(GROUP_SIZE(cmt_pgs) == 0); 847fb2f18f8Sesaxe 848fb2f18f8Sesaxe /* 849fb2f18f8Sesaxe * Now that the load balancing lineage updates have happened, 850fb2f18f8Sesaxe * remove the CPU from all it's PGs (destroying any that become 851fb2f18f8Sesaxe * empty). 852fb2f18f8Sesaxe */ 853fb2f18f8Sesaxe group_iter_init(&i); 854fb2f18f8Sesaxe while ((pg = group_iterate(pgs, &i)) != NULL) { 855fb2f18f8Sesaxe if (IS_CMT_PG(pg) == 0) 856fb2f18f8Sesaxe continue; 857fb2f18f8Sesaxe 8581a77c24bSEric Saxe pg_cpu_delete((pg_t *)pg, cp, pgdata); 859fb2f18f8Sesaxe /* 860fb2f18f8Sesaxe * Deleting the CPU from the PG changes the CPU's 861fb2f18f8Sesaxe * PG group over which we are actively iterating 862fb2f18f8Sesaxe * Re-initialize the iteration 863fb2f18f8Sesaxe */ 864fb2f18f8Sesaxe group_iter_init(&i); 865fb2f18f8Sesaxe 866fb2f18f8Sesaxe if (GROUP_SIZE(&((pg_t *)pg)->pg_cpus) == 0) { 867fb2f18f8Sesaxe 868fb2f18f8Sesaxe /* 869fb2f18f8Sesaxe * The PG has become zero sized, so destroy it. 870fb2f18f8Sesaxe */ 871fb2f18f8Sesaxe group_destroy(&pg->cmt_cpus_actv); 872fb2f18f8Sesaxe bitset_fini(&pg->cmt_cpus_actv_set); 873fb2f18f8Sesaxe pghw_fini((pghw_t *)pg); 874fb2f18f8Sesaxe 875fb2f18f8Sesaxe pg_destroy((pg_t *)pg); 876fb2f18f8Sesaxe } 877fb2f18f8Sesaxe } 878fb2f18f8Sesaxe } 879fb2f18f8Sesaxe 880fb2f18f8Sesaxe /* 881fb2f18f8Sesaxe * Class callback when a CPU is entering a cpu partition 882fb2f18f8Sesaxe */ 883fb2f18f8Sesaxe static void 884fb2f18f8Sesaxe pg_cmt_cpupart_in(cpu_t *cp, cpupart_t *pp) 885fb2f18f8Sesaxe { 886fb2f18f8Sesaxe group_t *pgs; 887fb2f18f8Sesaxe pg_t *pg; 888fb2f18f8Sesaxe group_iter_t i; 889fb2f18f8Sesaxe 890fb2f18f8Sesaxe ASSERT(MUTEX_HELD(&cpu_lock)); 891fb2f18f8Sesaxe 8920e751525SEric Saxe if (cmt_sched_disabled) 8930e751525SEric Saxe return; 8940e751525SEric Saxe 895fb2f18f8Sesaxe pgs = &cp->cpu_pg->pgs; 896fb2f18f8Sesaxe 897fb2f18f8Sesaxe /* 898fb2f18f8Sesaxe * Ensure that the new partition's PG bitset 899fb2f18f8Sesaxe * is large enough for all CMT PG's to which cp 900fb2f18f8Sesaxe * belongs 901fb2f18f8Sesaxe */ 902fb2f18f8Sesaxe group_iter_init(&i); 903fb2f18f8Sesaxe while ((pg = group_iterate(pgs, &i)) != NULL) { 904fb2f18f8Sesaxe if (IS_CMT_PG(pg) == 0) 905fb2f18f8Sesaxe continue; 906fb2f18f8Sesaxe 907fb2f18f8Sesaxe if (bitset_capacity(&pp->cp_cmt_pgs) <= pg->pg_id) 908fb2f18f8Sesaxe bitset_resize(&pp->cp_cmt_pgs, pg->pg_id + 1); 909fb2f18f8Sesaxe } 910fb2f18f8Sesaxe } 911fb2f18f8Sesaxe 912fb2f18f8Sesaxe /* 913fb2f18f8Sesaxe * Class callback when a CPU is actually moving partitions 914fb2f18f8Sesaxe */ 915fb2f18f8Sesaxe static void 916fb2f18f8Sesaxe pg_cmt_cpupart_move(cpu_t *cp, cpupart_t *oldpp, cpupart_t *newpp) 917fb2f18f8Sesaxe { 918fb2f18f8Sesaxe cpu_t *cpp; 919fb2f18f8Sesaxe group_t *pgs; 920fb2f18f8Sesaxe pg_t *pg; 921fb2f18f8Sesaxe group_iter_t pg_iter; 922fb2f18f8Sesaxe pg_cpu_itr_t cpu_iter; 923fb2f18f8Sesaxe boolean_t found; 924fb2f18f8Sesaxe 925fb2f18f8Sesaxe ASSERT(MUTEX_HELD(&cpu_lock)); 926fb2f18f8Sesaxe 9270e751525SEric Saxe if (cmt_sched_disabled) 9280e751525SEric Saxe return; 9290e751525SEric Saxe 930fb2f18f8Sesaxe pgs = &cp->cpu_pg->pgs; 931fb2f18f8Sesaxe group_iter_init(&pg_iter); 932fb2f18f8Sesaxe 933fb2f18f8Sesaxe /* 934fb2f18f8Sesaxe * Iterate over the CPUs CMT PGs 935fb2f18f8Sesaxe */ 936fb2f18f8Sesaxe while ((pg = group_iterate(pgs, &pg_iter)) != NULL) { 937fb2f18f8Sesaxe 938fb2f18f8Sesaxe if (IS_CMT_PG(pg) == 0) 939fb2f18f8Sesaxe continue; 940fb2f18f8Sesaxe 941fb2f18f8Sesaxe /* 942fb2f18f8Sesaxe * Add the PG to the bitset in the new partition. 943fb2f18f8Sesaxe */ 944fb2f18f8Sesaxe bitset_add(&newpp->cp_cmt_pgs, pg->pg_id); 945fb2f18f8Sesaxe 946fb2f18f8Sesaxe /* 947fb2f18f8Sesaxe * Remove the PG from the bitset in the old partition 948fb2f18f8Sesaxe * if the last of the PG's CPUs have left. 949fb2f18f8Sesaxe */ 950fb2f18f8Sesaxe found = B_FALSE; 951fb2f18f8Sesaxe PG_CPU_ITR_INIT(pg, cpu_iter); 952fb2f18f8Sesaxe while ((cpp = pg_cpu_next(&cpu_iter)) != NULL) { 953fb2f18f8Sesaxe if (cpp == cp) 954fb2f18f8Sesaxe continue; 955a6604450Sesaxe if (CPU_ACTIVE(cpp) && 956a6604450Sesaxe cpp->cpu_part->cp_id == oldpp->cp_id) { 957fb2f18f8Sesaxe found = B_TRUE; 958fb2f18f8Sesaxe break; 959fb2f18f8Sesaxe } 960fb2f18f8Sesaxe } 961fb2f18f8Sesaxe if (!found) 962fb2f18f8Sesaxe bitset_del(&cp->cpu_part->cp_cmt_pgs, pg->pg_id); 963fb2f18f8Sesaxe } 964fb2f18f8Sesaxe } 965fb2f18f8Sesaxe 966fb2f18f8Sesaxe /* 967fb2f18f8Sesaxe * Class callback when a CPU becomes active (online) 968fb2f18f8Sesaxe * 969fb2f18f8Sesaxe * This is called in a context where CPUs are paused 970fb2f18f8Sesaxe */ 971fb2f18f8Sesaxe static void 972fb2f18f8Sesaxe pg_cmt_cpu_active(cpu_t *cp) 973fb2f18f8Sesaxe { 974fb2f18f8Sesaxe int err; 975fb2f18f8Sesaxe group_iter_t i; 976fb2f18f8Sesaxe pg_cmt_t *pg; 977fb2f18f8Sesaxe group_t *pgs; 978fb2f18f8Sesaxe 979fb2f18f8Sesaxe ASSERT(MUTEX_HELD(&cpu_lock)); 980fb2f18f8Sesaxe 9810e751525SEric Saxe if (cmt_sched_disabled) 9820e751525SEric Saxe return; 9830e751525SEric Saxe 984fb2f18f8Sesaxe pgs = &cp->cpu_pg->pgs; 985fb2f18f8Sesaxe group_iter_init(&i); 986fb2f18f8Sesaxe 987fb2f18f8Sesaxe /* 988fb2f18f8Sesaxe * Iterate over the CPU's PGs 989fb2f18f8Sesaxe */ 990fb2f18f8Sesaxe while ((pg = group_iterate(pgs, &i)) != NULL) { 991fb2f18f8Sesaxe 992fb2f18f8Sesaxe if (IS_CMT_PG(pg) == 0) 993fb2f18f8Sesaxe continue; 994fb2f18f8Sesaxe 995b885580bSAlexander Kolbasov /* 996b885580bSAlexander Kolbasov * Move to the next generation since topology is changing 997b885580bSAlexander Kolbasov */ 998b885580bSAlexander Kolbasov ((pghw_t *)pg)->pghw_generation++; 999b885580bSAlexander Kolbasov 1000fb2f18f8Sesaxe err = group_add(&pg->cmt_cpus_actv, cp, GRP_NORESIZE); 1001fb2f18f8Sesaxe ASSERT(err == 0); 1002fb2f18f8Sesaxe 1003fb2f18f8Sesaxe /* 1004fb2f18f8Sesaxe * If this is the first active CPU in the PG, and it 1005fb2f18f8Sesaxe * represents a hardware sharing relationship over which 1006fb2f18f8Sesaxe * CMT load balancing is performed, add it as a candidate 1007fb2f18f8Sesaxe * for balancing with it's siblings. 1008fb2f18f8Sesaxe */ 1009fb2f18f8Sesaxe if (GROUP_SIZE(&pg->cmt_cpus_actv) == 1 && 10100e751525SEric Saxe (pg->cmt_policy & (CMT_BALANCE | CMT_COALESCE))) { 1011fb2f18f8Sesaxe err = group_add(pg->cmt_siblings, pg, GRP_NORESIZE); 1012fb2f18f8Sesaxe ASSERT(err == 0); 10136890d023SEric Saxe 10146890d023SEric Saxe /* 10156890d023SEric Saxe * If this is a top level PG, add it as a balancing 10160e751525SEric Saxe * candidate when balancing within the root lgroup. 10176890d023SEric Saxe */ 10180e751525SEric Saxe if (pg->cmt_parent == NULL && 10190e751525SEric Saxe pg->cmt_siblings != &cmt_root->cl_pgs) { 10206890d023SEric Saxe err = group_add(&cmt_root->cl_pgs, pg, 10216890d023SEric Saxe GRP_NORESIZE); 10226890d023SEric Saxe ASSERT(err == 0); 10236890d023SEric Saxe } 1024fb2f18f8Sesaxe } 1025fb2f18f8Sesaxe 1026fb2f18f8Sesaxe /* 1027fb2f18f8Sesaxe * Notate the CPU in the PGs active CPU bitset. 1028fb2f18f8Sesaxe * Also notate the PG as being active in it's associated 1029fb2f18f8Sesaxe * partition 1030fb2f18f8Sesaxe */ 1031fb2f18f8Sesaxe bitset_add(&pg->cmt_cpus_actv_set, cp->cpu_seqid); 1032fb2f18f8Sesaxe bitset_add(&cp->cpu_part->cp_cmt_pgs, ((pg_t *)pg)->pg_id); 1033fb2f18f8Sesaxe } 1034fb2f18f8Sesaxe } 1035fb2f18f8Sesaxe 1036fb2f18f8Sesaxe /* 1037fb2f18f8Sesaxe * Class callback when a CPU goes inactive (offline) 1038fb2f18f8Sesaxe * 1039fb2f18f8Sesaxe * This is called in a context where CPUs are paused 1040fb2f18f8Sesaxe */ 1041fb2f18f8Sesaxe static void 1042fb2f18f8Sesaxe pg_cmt_cpu_inactive(cpu_t *cp) 1043fb2f18f8Sesaxe { 1044fb2f18f8Sesaxe int err; 1045fb2f18f8Sesaxe group_t *pgs; 1046fb2f18f8Sesaxe pg_cmt_t *pg; 1047fb2f18f8Sesaxe cpu_t *cpp; 1048fb2f18f8Sesaxe group_iter_t i; 1049fb2f18f8Sesaxe pg_cpu_itr_t cpu_itr; 1050fb2f18f8Sesaxe boolean_t found; 1051fb2f18f8Sesaxe 1052fb2f18f8Sesaxe ASSERT(MUTEX_HELD(&cpu_lock)); 1053fb2f18f8Sesaxe 10540e751525SEric Saxe if (cmt_sched_disabled) 10550e751525SEric Saxe return; 10560e751525SEric Saxe 1057fb2f18f8Sesaxe pgs = &cp->cpu_pg->pgs; 1058fb2f18f8Sesaxe group_iter_init(&i); 1059fb2f18f8Sesaxe 1060fb2f18f8Sesaxe while ((pg = group_iterate(pgs, &i)) != NULL) { 1061fb2f18f8Sesaxe 1062fb2f18f8Sesaxe if (IS_CMT_PG(pg) == 0) 1063fb2f18f8Sesaxe continue; 1064fb2f18f8Sesaxe 1065fb2f18f8Sesaxe /* 1066b885580bSAlexander Kolbasov * Move to the next generation since topology is changing 1067b885580bSAlexander Kolbasov */ 1068b885580bSAlexander Kolbasov ((pghw_t *)pg)->pghw_generation++; 1069b885580bSAlexander Kolbasov 1070b885580bSAlexander Kolbasov /* 1071fb2f18f8Sesaxe * Remove the CPU from the CMT PGs active CPU group 1072fb2f18f8Sesaxe * bitmap 1073fb2f18f8Sesaxe */ 1074fb2f18f8Sesaxe err = group_remove(&pg->cmt_cpus_actv, cp, GRP_NORESIZE); 1075fb2f18f8Sesaxe ASSERT(err == 0); 1076fb2f18f8Sesaxe 1077fb2f18f8Sesaxe bitset_del(&pg->cmt_cpus_actv_set, cp->cpu_seqid); 1078fb2f18f8Sesaxe 1079fb2f18f8Sesaxe /* 1080fb2f18f8Sesaxe * If there are no more active CPUs in this PG over which 1081fb2f18f8Sesaxe * load was balanced, remove it as a balancing candidate. 1082fb2f18f8Sesaxe */ 1083fb2f18f8Sesaxe if (GROUP_SIZE(&pg->cmt_cpus_actv) == 0 && 10840e751525SEric Saxe (pg->cmt_policy & (CMT_BALANCE | CMT_COALESCE))) { 1085fb2f18f8Sesaxe err = group_remove(pg->cmt_siblings, pg, GRP_NORESIZE); 1086fb2f18f8Sesaxe ASSERT(err == 0); 10876890d023SEric Saxe 10880e751525SEric Saxe if (pg->cmt_parent == NULL && 10890e751525SEric Saxe pg->cmt_siblings != &cmt_root->cl_pgs) { 10906890d023SEric Saxe err = group_remove(&cmt_root->cl_pgs, pg, 10916890d023SEric Saxe GRP_NORESIZE); 10926890d023SEric Saxe ASSERT(err == 0); 10936890d023SEric Saxe } 1094fb2f18f8Sesaxe } 1095fb2f18f8Sesaxe 1096fb2f18f8Sesaxe /* 1097fb2f18f8Sesaxe * Assert the number of active CPUs does not exceed 1098fb2f18f8Sesaxe * the total number of CPUs in the PG 1099fb2f18f8Sesaxe */ 1100fb2f18f8Sesaxe ASSERT(GROUP_SIZE(&pg->cmt_cpus_actv) <= 1101fb2f18f8Sesaxe GROUP_SIZE(&((pg_t *)pg)->pg_cpus)); 1102fb2f18f8Sesaxe 1103fb2f18f8Sesaxe /* 1104fb2f18f8Sesaxe * Update the PG bitset in the CPU's old partition 1105fb2f18f8Sesaxe */ 1106fb2f18f8Sesaxe found = B_FALSE; 1107fb2f18f8Sesaxe PG_CPU_ITR_INIT(pg, cpu_itr); 1108fb2f18f8Sesaxe while ((cpp = pg_cpu_next(&cpu_itr)) != NULL) { 1109fb2f18f8Sesaxe if (cpp == cp) 1110fb2f18f8Sesaxe continue; 1111a6604450Sesaxe if (CPU_ACTIVE(cpp) && 1112a6604450Sesaxe cpp->cpu_part->cp_id == cp->cpu_part->cp_id) { 1113fb2f18f8Sesaxe found = B_TRUE; 1114fb2f18f8Sesaxe break; 1115fb2f18f8Sesaxe } 1116fb2f18f8Sesaxe } 1117fb2f18f8Sesaxe if (!found) { 1118fb2f18f8Sesaxe bitset_del(&cp->cpu_part->cp_cmt_pgs, 1119fb2f18f8Sesaxe ((pg_t *)pg)->pg_id); 1120fb2f18f8Sesaxe } 1121fb2f18f8Sesaxe } 1122fb2f18f8Sesaxe } 1123fb2f18f8Sesaxe 1124fb2f18f8Sesaxe /* 1125fb2f18f8Sesaxe * Return non-zero if the CPU belongs in the given PG 1126fb2f18f8Sesaxe */ 1127fb2f18f8Sesaxe static int 1128fb2f18f8Sesaxe pg_cmt_cpu_belongs(pg_t *pg, cpu_t *cp) 1129fb2f18f8Sesaxe { 1130fb2f18f8Sesaxe cpu_t *pg_cpu; 1131fb2f18f8Sesaxe 1132fb2f18f8Sesaxe pg_cpu = GROUP_ACCESS(&pg->pg_cpus, 0); 1133fb2f18f8Sesaxe 1134fb2f18f8Sesaxe ASSERT(pg_cpu != NULL); 1135fb2f18f8Sesaxe 1136fb2f18f8Sesaxe /* 1137fb2f18f8Sesaxe * The CPU belongs if, given the nature of the hardware sharing 1138fb2f18f8Sesaxe * relationship represented by the PG, the CPU has that 1139fb2f18f8Sesaxe * relationship with some other CPU already in the PG 1140fb2f18f8Sesaxe */ 1141fb2f18f8Sesaxe if (pg_plat_cpus_share(cp, pg_cpu, ((pghw_t *)pg)->pghw_hw)) 1142fb2f18f8Sesaxe return (1); 1143fb2f18f8Sesaxe 1144fb2f18f8Sesaxe return (0); 1145fb2f18f8Sesaxe } 1146fb2f18f8Sesaxe 1147fb2f18f8Sesaxe /* 11480e751525SEric Saxe * Sort the CPUs CMT hierarchy, where "size" is the number of levels. 1149fb2f18f8Sesaxe */ 1150fb2f18f8Sesaxe static void 11510e751525SEric Saxe pg_cmt_hier_sort(pg_cmt_t **hier, int size) 1152fb2f18f8Sesaxe { 11538031591dSSrihari Venkatesan int i, j, inc, sz; 11548031591dSSrihari Venkatesan int start, end; 11550e751525SEric Saxe pg_t *tmp; 11560e751525SEric Saxe pg_t **h = (pg_t **)hier; 1157fb2f18f8Sesaxe 11580e751525SEric Saxe /* 11590e751525SEric Saxe * First sort by number of CPUs 11600e751525SEric Saxe */ 11610e751525SEric Saxe inc = size / 2; 11620e751525SEric Saxe while (inc > 0) { 11630e751525SEric Saxe for (i = inc; i < size; i++) { 11640e751525SEric Saxe j = i; 11650e751525SEric Saxe tmp = h[i]; 11660e751525SEric Saxe while ((j >= inc) && 11670e751525SEric Saxe (PG_NUM_CPUS(h[j - inc]) > PG_NUM_CPUS(tmp))) { 11680e751525SEric Saxe h[j] = h[j - inc]; 11690e751525SEric Saxe j = j - inc; 11700e751525SEric Saxe } 11710e751525SEric Saxe h[j] = tmp; 11720e751525SEric Saxe } 11730e751525SEric Saxe if (inc == 2) 11740e751525SEric Saxe inc = 1; 11750e751525SEric Saxe else 11760e751525SEric Saxe inc = (inc * 5) / 11; 11770e751525SEric Saxe } 1178fb2f18f8Sesaxe 11790e751525SEric Saxe /* 11800e751525SEric Saxe * Break ties by asking the platform. 11810e751525SEric Saxe * Determine if h[i] outranks h[i + 1] and if so, swap them. 11820e751525SEric Saxe */ 11838031591dSSrihari Venkatesan for (start = 0; start < size; start++) { 11848031591dSSrihari Venkatesan 11858031591dSSrihari Venkatesan /* 11868031591dSSrihari Venkatesan * Find various contiguous sets of elements, 11878031591dSSrihari Venkatesan * in the array, with the same number of cpus 11888031591dSSrihari Venkatesan */ 11898031591dSSrihari Venkatesan end = start; 11908031591dSSrihari Venkatesan sz = PG_NUM_CPUS(h[start]); 11918031591dSSrihari Venkatesan while ((end < size) && (sz == PG_NUM_CPUS(h[end]))) 11928031591dSSrihari Venkatesan end++; 11938031591dSSrihari Venkatesan /* 11948031591dSSrihari Venkatesan * Sort each such set of the array by rank 11958031591dSSrihari Venkatesan */ 11968031591dSSrihari Venkatesan for (i = start + 1; i < end; i++) { 11978031591dSSrihari Venkatesan j = i - 1; 11980e751525SEric Saxe tmp = h[i]; 11998031591dSSrihari Venkatesan while (j >= start && 12008031591dSSrihari Venkatesan pg_cmt_hier_rank(hier[j], 12018031591dSSrihari Venkatesan (pg_cmt_t *)tmp) == hier[j]) { 12028031591dSSrihari Venkatesan h[j + 1] = h[j]; 12038031591dSSrihari Venkatesan j--; 12048031591dSSrihari Venkatesan } 12058031591dSSrihari Venkatesan h[j + 1] = tmp; 1206fb2f18f8Sesaxe } 1207fb2f18f8Sesaxe } 1208fb2f18f8Sesaxe } 1209fb2f18f8Sesaxe 1210fb2f18f8Sesaxe /* 1211fb2f18f8Sesaxe * Return a cmt_lgrp_t * given an lgroup handle. 1212fb2f18f8Sesaxe */ 1213fb2f18f8Sesaxe static cmt_lgrp_t * 1214fb2f18f8Sesaxe pg_cmt_find_lgrp(lgrp_handle_t hand) 1215fb2f18f8Sesaxe { 1216fb2f18f8Sesaxe cmt_lgrp_t *lgrp; 1217fb2f18f8Sesaxe 1218fb2f18f8Sesaxe ASSERT(MUTEX_HELD(&cpu_lock)); 1219fb2f18f8Sesaxe 1220fb2f18f8Sesaxe lgrp = cmt_lgrps; 1221fb2f18f8Sesaxe while (lgrp != NULL) { 1222fb2f18f8Sesaxe if (lgrp->cl_hand == hand) 1223a6604450Sesaxe break; 1224fb2f18f8Sesaxe lgrp = lgrp->cl_next; 1225fb2f18f8Sesaxe } 1226a6604450Sesaxe return (lgrp); 1227a6604450Sesaxe } 1228fb2f18f8Sesaxe 1229fb2f18f8Sesaxe /* 1230a6604450Sesaxe * Create a cmt_lgrp_t with the specified handle. 1231fb2f18f8Sesaxe */ 1232a6604450Sesaxe static cmt_lgrp_t * 1233a6604450Sesaxe pg_cmt_lgrp_create(lgrp_handle_t hand) 1234a6604450Sesaxe { 1235a6604450Sesaxe cmt_lgrp_t *lgrp; 1236a6604450Sesaxe 1237a6604450Sesaxe ASSERT(MUTEX_HELD(&cpu_lock)); 1238a6604450Sesaxe 1239fb2f18f8Sesaxe lgrp = kmem_zalloc(sizeof (cmt_lgrp_t), KM_SLEEP); 1240fb2f18f8Sesaxe 1241fb2f18f8Sesaxe lgrp->cl_hand = hand; 1242fb2f18f8Sesaxe lgrp->cl_npgs = 0; 1243fb2f18f8Sesaxe lgrp->cl_next = cmt_lgrps; 1244fb2f18f8Sesaxe cmt_lgrps = lgrp; 1245fb2f18f8Sesaxe group_create(&lgrp->cl_pgs); 1246fb2f18f8Sesaxe 1247fb2f18f8Sesaxe return (lgrp); 1248fb2f18f8Sesaxe } 12496890d023SEric Saxe 12506890d023SEric Saxe /* 12510e751525SEric Saxe * Interfaces to enable and disable power aware dispatching 12520e751525SEric Saxe * The caller must be holding cpu_lock. 12536890d023SEric Saxe * 12540e751525SEric Saxe * Return 0 on success and -1 on failure. 12556890d023SEric Saxe */ 12560e751525SEric Saxe int 12570e751525SEric Saxe cmt_pad_enable(pghw_type_t type) 12586890d023SEric Saxe { 12590e751525SEric Saxe group_t *hwset; 12600e751525SEric Saxe group_iter_t iter; 12610e751525SEric Saxe pg_cmt_t *pg; 12626890d023SEric Saxe 12630e751525SEric Saxe ASSERT(PGHW_IS_PM_DOMAIN(type)); 12640e751525SEric Saxe ASSERT(MUTEX_HELD(&cpu_lock)); 12656890d023SEric Saxe 1266040ea03aSRichard Lowe if (cmt_sched_disabled == 1) 1267040ea03aSRichard Lowe return (-1); 1268040ea03aSRichard Lowe 12690e751525SEric Saxe if ((hwset = pghw_set_lookup(type)) == NULL || 12700e751525SEric Saxe cmt_hw_blacklisted[type]) { 12710e751525SEric Saxe /* 12720e751525SEric Saxe * Unable to find any instances of the specified type 12730e751525SEric Saxe * of power domain, or the power domains have been blacklisted. 12740e751525SEric Saxe */ 12750e751525SEric Saxe return (-1); 12760e751525SEric Saxe } 12776890d023SEric Saxe 12786890d023SEric Saxe /* 12790e751525SEric Saxe * Iterate over the power domains, setting the default dispatcher 12800e751525SEric Saxe * policy for power/performance optimization. 12810e751525SEric Saxe * 12820e751525SEric Saxe * Simply setting the policy isn't enough in the case where the power 12830e751525SEric Saxe * domain is an only child of another PG. Because the dispatcher walks 12840e751525SEric Saxe * the PG hierarchy in a top down fashion, the higher up PG's policy 12850e751525SEric Saxe * will dominate. So promote the power domain above it's parent if both 12860e751525SEric Saxe * PG and it's parent have the same CPUs to ensure it's policy 12870e751525SEric Saxe * dominates. 12886890d023SEric Saxe */ 12890e751525SEric Saxe group_iter_init(&iter); 12900e751525SEric Saxe while ((pg = group_iterate(hwset, &iter)) != NULL) { 12910e751525SEric Saxe /* 12920e751525SEric Saxe * If the power domain is an only child to a parent 12930e751525SEric Saxe * not implementing the same policy, promote the child 12940e751525SEric Saxe * above the parent to activate the policy. 12950e751525SEric Saxe */ 12960e751525SEric Saxe pg->cmt_policy = pg_cmt_policy(((pghw_t *)pg)->pghw_hw); 12970e751525SEric Saxe while ((pg->cmt_parent != NULL) && 12980e751525SEric Saxe (pg->cmt_parent->cmt_policy != pg->cmt_policy) && 12990e751525SEric Saxe (PG_NUM_CPUS((pg_t *)pg) == 13000e751525SEric Saxe PG_NUM_CPUS((pg_t *)pg->cmt_parent))) { 13011a77c24bSEric Saxe cmt_hier_promote(pg, NULL); 13020e751525SEric Saxe } 13030e751525SEric Saxe } 13040e751525SEric Saxe 13050e751525SEric Saxe return (0); 13060e751525SEric Saxe } 13070e751525SEric Saxe 13080e751525SEric Saxe int 13090e751525SEric Saxe cmt_pad_disable(pghw_type_t type) 13100e751525SEric Saxe { 13110e751525SEric Saxe group_t *hwset; 13120e751525SEric Saxe group_iter_t iter; 13130e751525SEric Saxe pg_cmt_t *pg; 13140e751525SEric Saxe pg_cmt_t *child; 13150e751525SEric Saxe 13160e751525SEric Saxe ASSERT(PGHW_IS_PM_DOMAIN(type)); 13170e751525SEric Saxe ASSERT(MUTEX_HELD(&cpu_lock)); 13180e751525SEric Saxe 1319040ea03aSRichard Lowe if (cmt_sched_disabled == 1) 1320040ea03aSRichard Lowe return (-1); 1321040ea03aSRichard Lowe 13220e751525SEric Saxe if ((hwset = pghw_set_lookup(type)) == NULL) { 13230e751525SEric Saxe /* 13240e751525SEric Saxe * Unable to find any instances of the specified type of 13250e751525SEric Saxe * power domain. 13260e751525SEric Saxe */ 13270e751525SEric Saxe return (-1); 13280e751525SEric Saxe } 13290e751525SEric Saxe /* 13300e751525SEric Saxe * Iterate over the power domains, setting the default dispatcher 13310e751525SEric Saxe * policy for performance optimization (load balancing). 13320e751525SEric Saxe */ 13330e751525SEric Saxe group_iter_init(&iter); 13340e751525SEric Saxe while ((pg = group_iterate(hwset, &iter)) != NULL) { 13350e751525SEric Saxe 13360e751525SEric Saxe /* 13370e751525SEric Saxe * If the power domain has an only child that implements 13380e751525SEric Saxe * policy other than load balancing, promote the child 13390e751525SEric Saxe * above the power domain to ensure it's policy dominates. 13400e751525SEric Saxe */ 1341f03808b6SEric Saxe if (pg->cmt_children != NULL && 1342f03808b6SEric Saxe GROUP_SIZE(pg->cmt_children) == 1) { 13430e751525SEric Saxe child = GROUP_ACCESS(pg->cmt_children, 0); 13440e751525SEric Saxe if ((child->cmt_policy & CMT_BALANCE) == 0) { 13451a77c24bSEric Saxe cmt_hier_promote(child, NULL); 13460e751525SEric Saxe } 13470e751525SEric Saxe } 13480e751525SEric Saxe pg->cmt_policy = CMT_BALANCE; 13490e751525SEric Saxe } 13500e751525SEric Saxe return (0); 13510e751525SEric Saxe } 13520e751525SEric Saxe 13530e751525SEric Saxe /* ARGSUSED */ 13540e751525SEric Saxe static void 13550e751525SEric Saxe cmt_ev_thread_swtch(pg_t *pg, cpu_t *cp, hrtime_t now, kthread_t *old, 13560e751525SEric Saxe kthread_t *new) 13570e751525SEric Saxe { 13580e751525SEric Saxe pg_cmt_t *cmt_pg = (pg_cmt_t *)pg; 13590e751525SEric Saxe 13600e751525SEric Saxe if (old == cp->cpu_idle_thread) { 13611a5e258fSJosef 'Jeff' Sipek atomic_inc_32(&cmt_pg->cmt_utilization); 13620e751525SEric Saxe } else if (new == cp->cpu_idle_thread) { 13631a5e258fSJosef 'Jeff' Sipek atomic_dec_32(&cmt_pg->cmt_utilization); 13640e751525SEric Saxe } 13650e751525SEric Saxe } 13660e751525SEric Saxe 13670e751525SEric Saxe /* 13680e751525SEric Saxe * Macro to test whether a thread is currently runnable on a CPU in a PG. 13690e751525SEric Saxe */ 13700e751525SEric Saxe #define THREAD_RUNNABLE_IN_PG(t, pg) \ 13710e751525SEric Saxe ((t)->t_state == TS_RUN && \ 13720e751525SEric Saxe (t)->t_disp_queue->disp_cpu && \ 13730e751525SEric Saxe bitset_in_set(&(pg)->cmt_cpus_actv_set, \ 13740e751525SEric Saxe (t)->t_disp_queue->disp_cpu->cpu_seqid)) 13750e751525SEric Saxe 13760e751525SEric Saxe static void 13770e751525SEric Saxe cmt_ev_thread_swtch_pwr(pg_t *pg, cpu_t *cp, hrtime_t now, kthread_t *old, 13780e751525SEric Saxe kthread_t *new) 13790e751525SEric Saxe { 13800e751525SEric Saxe pg_cmt_t *cmt = (pg_cmt_t *)pg; 13810e751525SEric Saxe cpupm_domain_t *dom; 13820e751525SEric Saxe uint32_t u; 13830e751525SEric Saxe 13840e751525SEric Saxe if (old == cp->cpu_idle_thread) { 13850e751525SEric Saxe ASSERT(new != cp->cpu_idle_thread); 13861a5e258fSJosef 'Jeff' Sipek u = atomic_inc_32_nv(&cmt->cmt_utilization); 13870e751525SEric Saxe if (u == 1) { 13880e751525SEric Saxe /* 13890e751525SEric Saxe * Notify the CPU power manager that the domain 13900e751525SEric Saxe * is non-idle. 13910e751525SEric Saxe */ 13920e751525SEric Saxe dom = (cpupm_domain_t *)cmt->cmt_pg.pghw_handle; 13930e751525SEric Saxe cpupm_utilization_event(cp, now, dom, 13940e751525SEric Saxe CPUPM_DOM_BUSY_FROM_IDLE); 13950e751525SEric Saxe } 13960e751525SEric Saxe } else if (new == cp->cpu_idle_thread) { 13970e751525SEric Saxe ASSERT(old != cp->cpu_idle_thread); 13981a5e258fSJosef 'Jeff' Sipek u = atomic_dec_32_nv(&cmt->cmt_utilization); 13990e751525SEric Saxe if (u == 0) { 14000e751525SEric Saxe /* 14010e751525SEric Saxe * The domain is idle, notify the CPU power 14020e751525SEric Saxe * manager. 14030e751525SEric Saxe * 14040e751525SEric Saxe * Avoid notifying if the thread is simply migrating 14050e751525SEric Saxe * between CPUs in the domain. 14060e751525SEric Saxe */ 14070e751525SEric Saxe if (!THREAD_RUNNABLE_IN_PG(old, cmt)) { 14080e751525SEric Saxe dom = (cpupm_domain_t *)cmt->cmt_pg.pghw_handle; 14090e751525SEric Saxe cpupm_utilization_event(cp, now, dom, 14100e751525SEric Saxe CPUPM_DOM_IDLE_FROM_BUSY); 14110e751525SEric Saxe } 14120e751525SEric Saxe } 14130e751525SEric Saxe } 14140e751525SEric Saxe } 14150e751525SEric Saxe 14160e751525SEric Saxe /* ARGSUSED */ 14170e751525SEric Saxe static void 14180e751525SEric Saxe cmt_ev_thread_remain_pwr(pg_t *pg, cpu_t *cp, kthread_t *t) 14190e751525SEric Saxe { 14200e751525SEric Saxe pg_cmt_t *cmt = (pg_cmt_t *)pg; 14210e751525SEric Saxe cpupm_domain_t *dom; 14220e751525SEric Saxe 14230e751525SEric Saxe dom = (cpupm_domain_t *)cmt->cmt_pg.pghw_handle; 14240e751525SEric Saxe cpupm_utilization_event(cp, (hrtime_t)0, dom, CPUPM_DOM_REMAIN_BUSY); 14250e751525SEric Saxe } 14260e751525SEric Saxe 14270e751525SEric Saxe /* 14280e751525SEric Saxe * Return the name of the CMT scheduling policy 14290e751525SEric Saxe * being implemented across this PG 14300e751525SEric Saxe */ 14310e751525SEric Saxe static char * 14320e751525SEric Saxe pg_cmt_policy_name(pg_t *pg) 14330e751525SEric Saxe { 14340e751525SEric Saxe pg_cmt_policy_t policy; 14350e751525SEric Saxe 14360e751525SEric Saxe policy = ((pg_cmt_t *)pg)->cmt_policy; 14370e751525SEric Saxe 14380e751525SEric Saxe if (policy & CMT_AFFINITY) { 14390e751525SEric Saxe if (policy & CMT_BALANCE) 14400e751525SEric Saxe return ("Load Balancing & Affinity"); 14410e751525SEric Saxe else if (policy & CMT_COALESCE) 14420e751525SEric Saxe return ("Load Coalescence & Affinity"); 14436890d023SEric Saxe else 14440e751525SEric Saxe return ("Affinity"); 14450e751525SEric Saxe } else { 14460e751525SEric Saxe if (policy & CMT_BALANCE) 14470e751525SEric Saxe return ("Load Balancing"); 14480e751525SEric Saxe else if (policy & CMT_COALESCE) 14490e751525SEric Saxe return ("Load Coalescence"); 14500e751525SEric Saxe else 14510e751525SEric Saxe return ("None"); 14520e751525SEric Saxe } 14530e751525SEric Saxe } 14546890d023SEric Saxe 14556890d023SEric Saxe /* 14560e751525SEric Saxe * Prune PG, and all other instances of PG's hardware sharing relationship 1457d0e93b69SEric Saxe * from the CMT PG hierarchy. 14581a77c24bSEric Saxe * 14591a77c24bSEric Saxe * This routine operates on the CPU specific processor group data (for the CPUs 14601a77c24bSEric Saxe * in the PG being pruned), and may be invoked from a context where one CPU's 14611a77c24bSEric Saxe * PG data is under construction. In this case the argument "pgdata", if not 14621a77c24bSEric Saxe * NULL, is a reference to the CPU's under-construction PG data. 14636890d023SEric Saxe */ 14640e751525SEric Saxe static int 14651a77c24bSEric Saxe pg_cmt_prune(pg_cmt_t *pg_bad, pg_cmt_t **lineage, int *sz, cpu_pg_t *pgdata) 14660e751525SEric Saxe { 14670e751525SEric Saxe group_t *hwset, *children; 14680e751525SEric Saxe int i, j, r, size = *sz; 14690e751525SEric Saxe group_iter_t hw_iter, child_iter; 14700e751525SEric Saxe pg_cpu_itr_t cpu_iter; 14710e751525SEric Saxe pg_cmt_t *pg, *child; 14720e751525SEric Saxe cpu_t *cpu; 14730e751525SEric Saxe int cap_needed; 14740e751525SEric Saxe pghw_type_t hw; 14756890d023SEric Saxe 14760e751525SEric Saxe ASSERT(MUTEX_HELD(&cpu_lock)); 14776890d023SEric Saxe 1478d3c97224SAlexander Kolbasov /* 1479d3c97224SAlexander Kolbasov * Inform pghw layer that this PG is pruned. 1480d3c97224SAlexander Kolbasov */ 1481d3c97224SAlexander Kolbasov pghw_cmt_fini((pghw_t *)pg_bad); 1482d3c97224SAlexander Kolbasov 14830e751525SEric Saxe hw = ((pghw_t *)pg_bad)->pghw_hw; 14840e751525SEric Saxe 14850e751525SEric Saxe if (hw == PGHW_POW_ACTIVE) { 14860e751525SEric Saxe cmn_err(CE_NOTE, "!Active CPUPM domain groups look suspect. " 14870e751525SEric Saxe "Event Based CPUPM Unavailable"); 14880e751525SEric Saxe } else if (hw == PGHW_POW_IDLE) { 14890e751525SEric Saxe cmn_err(CE_NOTE, "!Idle CPUPM domain groups look suspect. " 14900e751525SEric Saxe "Dispatcher assisted CPUPM disabled."); 14910e751525SEric Saxe } 14926890d023SEric Saxe 14936890d023SEric Saxe /* 14940e751525SEric Saxe * Find and eliminate the PG from the lineage. 14956890d023SEric Saxe */ 14960e751525SEric Saxe for (i = 0; i < size; i++) { 14970e751525SEric Saxe if (lineage[i] == pg_bad) { 14980e751525SEric Saxe for (j = i; j < size - 1; j++) 14990e751525SEric Saxe lineage[j] = lineage[j + 1]; 15000e751525SEric Saxe *sz = size - 1; 15010e751525SEric Saxe break; 15020e751525SEric Saxe } 15030e751525SEric Saxe } 15040e751525SEric Saxe 15050e751525SEric Saxe /* 15060e751525SEric Saxe * We'll prune all instances of the hardware sharing relationship 15070e751525SEric Saxe * represented by pg. But before we do that (and pause CPUs) we need 15080e751525SEric Saxe * to ensure the hierarchy's groups are properly sized. 15090e751525SEric Saxe */ 15100e751525SEric Saxe hwset = pghw_set_lookup(hw); 15110e751525SEric Saxe 15120e751525SEric Saxe /* 1513d0e93b69SEric Saxe * Blacklist the hardware so future processor groups of this type won't 1514d0e93b69SEric Saxe * participate in CMT thread placement. 1515d0e93b69SEric Saxe * 1516d0e93b69SEric Saxe * XXX 1517d0e93b69SEric Saxe * For heterogeneous system configurations, this might be overkill. 1518d0e93b69SEric Saxe * We may only need to blacklist the illegal PGs, and other instances 1519d0e93b69SEric Saxe * of this hardware sharing relationship may be ok. 15200e751525SEric Saxe */ 15210e751525SEric Saxe cmt_hw_blacklisted[hw] = 1; 15220e751525SEric Saxe 15230e751525SEric Saxe /* 15240e751525SEric Saxe * For each of the PGs being pruned, ensure sufficient capacity in 15250e751525SEric Saxe * the siblings set for the PG's children 15260e751525SEric Saxe */ 15270e751525SEric Saxe group_iter_init(&hw_iter); 15280e751525SEric Saxe while ((pg = group_iterate(hwset, &hw_iter)) != NULL) { 15290e751525SEric Saxe /* 15300e751525SEric Saxe * PG is being pruned, but if it is bringing up more than 15310e751525SEric Saxe * one child, ask for more capacity in the siblings group. 15320e751525SEric Saxe */ 15330e751525SEric Saxe cap_needed = 0; 15340e751525SEric Saxe if (pg->cmt_children && 15350e751525SEric Saxe GROUP_SIZE(pg->cmt_children) > 1) { 15360e751525SEric Saxe cap_needed = GROUP_SIZE(pg->cmt_children) - 1; 15370e751525SEric Saxe 15380e751525SEric Saxe group_expand(pg->cmt_siblings, 15390e751525SEric Saxe GROUP_SIZE(pg->cmt_siblings) + cap_needed); 15400e751525SEric Saxe 15410e751525SEric Saxe /* 15420e751525SEric Saxe * If this is a top level group, also ensure the 15430e751525SEric Saxe * capacity in the root lgrp level CMT grouping. 15440e751525SEric Saxe */ 15450e751525SEric Saxe if (pg->cmt_parent == NULL && 15460e751525SEric Saxe pg->cmt_siblings != &cmt_root->cl_pgs) { 15470e751525SEric Saxe group_expand(&cmt_root->cl_pgs, 15480e751525SEric Saxe GROUP_SIZE(&cmt_root->cl_pgs) + cap_needed); 1549d0e93b69SEric Saxe cmt_root->cl_npgs += cap_needed; 15500e751525SEric Saxe } 15510e751525SEric Saxe } 15520e751525SEric Saxe } 15530e751525SEric Saxe 15540e751525SEric Saxe /* 15550e751525SEric Saxe * We're operating on the PG hierarchy. Pause CPUs to ensure 15560e751525SEric Saxe * exclusivity with respect to the dispatcher. 15570e751525SEric Saxe */ 1558*0ed5c46eSJosef 'Jeff' Sipek pause_cpus(NULL, NULL); 15590e751525SEric Saxe 15600e751525SEric Saxe /* 15610e751525SEric Saxe * Prune all PG instances of the hardware sharing relationship 15620e751525SEric Saxe * represented by pg. 15630e751525SEric Saxe */ 15640e751525SEric Saxe group_iter_init(&hw_iter); 15650e751525SEric Saxe while ((pg = group_iterate(hwset, &hw_iter)) != NULL) { 15660e751525SEric Saxe 15670e751525SEric Saxe /* 15680e751525SEric Saxe * Remove PG from it's group of siblings, if it's there. 15690e751525SEric Saxe */ 15700e751525SEric Saxe if (pg->cmt_siblings) { 15710e751525SEric Saxe (void) group_remove(pg->cmt_siblings, pg, GRP_NORESIZE); 15720e751525SEric Saxe } 15730e751525SEric Saxe if (pg->cmt_parent == NULL && 15740e751525SEric Saxe pg->cmt_siblings != &cmt_root->cl_pgs) { 15750e751525SEric Saxe (void) group_remove(&cmt_root->cl_pgs, pg, 15760e751525SEric Saxe GRP_NORESIZE); 15770e751525SEric Saxe } 1578d0e93b69SEric Saxe 1579d0e93b69SEric Saxe /* 1580d0e93b69SEric Saxe * Indicate that no CMT policy will be implemented across 1581d0e93b69SEric Saxe * this PG. 1582d0e93b69SEric Saxe */ 1583d0e93b69SEric Saxe pg->cmt_policy = CMT_NO_POLICY; 1584d0e93b69SEric Saxe 15850e751525SEric Saxe /* 1586ef4f35d8SEric Saxe * Move PG's children from it's children set to it's parent's 1587ef4f35d8SEric Saxe * children set. Note that the parent's children set, and PG's 1588ef4f35d8SEric Saxe * siblings set are the same thing. 1589ef4f35d8SEric Saxe * 1590ef4f35d8SEric Saxe * Because we are iterating over the same group that we are 1591ef4f35d8SEric Saxe * operating on (removing the children), first add all of PG's 1592ef4f35d8SEric Saxe * children to the parent's children set, and once we are done 1593ef4f35d8SEric Saxe * iterating, empty PG's children set. 15940e751525SEric Saxe */ 15950e751525SEric Saxe if (pg->cmt_children != NULL) { 15960e751525SEric Saxe children = pg->cmt_children; 15970e751525SEric Saxe 15980e751525SEric Saxe group_iter_init(&child_iter); 15990e751525SEric Saxe while ((child = group_iterate(children, &child_iter)) 16000e751525SEric Saxe != NULL) { 1601ef4f35d8SEric Saxe if (pg->cmt_siblings != NULL) { 16020e751525SEric Saxe r = group_add(pg->cmt_siblings, child, 16030e751525SEric Saxe GRP_NORESIZE); 16040e751525SEric Saxe ASSERT(r == 0); 1605d0e93b69SEric Saxe 1606d0e93b69SEric Saxe if (pg->cmt_parent == NULL && 1607d0e93b69SEric Saxe pg->cmt_siblings != 1608d0e93b69SEric Saxe &cmt_root->cl_pgs) { 1609d0e93b69SEric Saxe r = group_add(&cmt_root->cl_pgs, 1610d0e93b69SEric Saxe child, GRP_NORESIZE); 1611d0e93b69SEric Saxe ASSERT(r == 0); 1612d0e93b69SEric Saxe } 16130e751525SEric Saxe } 16140e751525SEric Saxe } 1615ef4f35d8SEric Saxe group_empty(pg->cmt_children); 16160e751525SEric Saxe } 16170e751525SEric Saxe 16180e751525SEric Saxe /* 16190e751525SEric Saxe * Reset the callbacks to the defaults 16200e751525SEric Saxe */ 16210e751525SEric Saxe pg_callback_set_defaults((pg_t *)pg); 16220e751525SEric Saxe 16230e751525SEric Saxe /* 16240e751525SEric Saxe * Update all the CPU lineages in each of PG's CPUs 16250e751525SEric Saxe */ 16260e751525SEric Saxe PG_CPU_ITR_INIT(pg, cpu_iter); 16270e751525SEric Saxe while ((cpu = pg_cpu_next(&cpu_iter)) != NULL) { 16280e751525SEric Saxe pg_cmt_t *cpu_pg; 16290e751525SEric Saxe group_iter_t liter; /* Iterator for the lineage */ 16301a77c24bSEric Saxe cpu_pg_t *cpd; /* CPU's PG data */ 16311a77c24bSEric Saxe 16321a77c24bSEric Saxe /* 16331a77c24bSEric Saxe * The CPU's lineage is under construction still 16341a77c24bSEric Saxe * references the bootstrap CPU PG data structure. 16351a77c24bSEric Saxe */ 16361a77c24bSEric Saxe if (pg_cpu_is_bootstrapped(cpu)) 16371a77c24bSEric Saxe cpd = pgdata; 16381a77c24bSEric Saxe else 16391a77c24bSEric Saxe cpd = cpu->cpu_pg; 16400e751525SEric Saxe 16410e751525SEric Saxe /* 16420e751525SEric Saxe * Iterate over the CPU's PGs updating the children 16430e751525SEric Saxe * of the PG being promoted, since they have a new 16440e751525SEric Saxe * parent and siblings set. 16450e751525SEric Saxe */ 16460e751525SEric Saxe group_iter_init(&liter); 16471a77c24bSEric Saxe while ((cpu_pg = group_iterate(&cpd->pgs, 16481a77c24bSEric Saxe &liter)) != NULL) { 16490e751525SEric Saxe if (cpu_pg->cmt_parent == pg) { 16500e751525SEric Saxe cpu_pg->cmt_parent = pg->cmt_parent; 16510e751525SEric Saxe cpu_pg->cmt_siblings = pg->cmt_siblings; 16520e751525SEric Saxe } 16530e751525SEric Saxe } 16540e751525SEric Saxe 16550e751525SEric Saxe /* 16560e751525SEric Saxe * Update the CPU's lineages 1657d0e93b69SEric Saxe * 1658d0e93b69SEric Saxe * Remove the PG from the CPU's group used for CMT 1659d0e93b69SEric Saxe * scheduling. 16600e751525SEric Saxe */ 16611a77c24bSEric Saxe (void) group_remove(&cpd->cmt_pgs, pg, GRP_NORESIZE); 16620e751525SEric Saxe } 16630e751525SEric Saxe } 16640e751525SEric Saxe start_cpus(); 16650e751525SEric Saxe return (0); 16660e751525SEric Saxe } 16670e751525SEric Saxe 16680e751525SEric Saxe /* 16690e751525SEric Saxe * Disable CMT scheduling 16700e751525SEric Saxe */ 16710e751525SEric Saxe static void 16720e751525SEric Saxe pg_cmt_disable(void) 16730e751525SEric Saxe { 16740e751525SEric Saxe cpu_t *cpu; 16750e751525SEric Saxe 16761a77c24bSEric Saxe ASSERT(MUTEX_HELD(&cpu_lock)); 16771a77c24bSEric Saxe 1678*0ed5c46eSJosef 'Jeff' Sipek pause_cpus(NULL, NULL); 16790e751525SEric Saxe cpu = cpu_list; 16800e751525SEric Saxe 16816890d023SEric Saxe do { 16820e751525SEric Saxe if (cpu->cpu_pg) 16830e751525SEric Saxe group_empty(&cpu->cpu_pg->cmt_pgs); 16840e751525SEric Saxe } while ((cpu = cpu->cpu_next) != cpu_list); 16850e751525SEric Saxe 16860e751525SEric Saxe cmt_sched_disabled = 1; 16870e751525SEric Saxe start_cpus(); 16880e751525SEric Saxe cmn_err(CE_NOTE, "!CMT thread placement optimizations unavailable"); 16890e751525SEric Saxe } 16900e751525SEric Saxe 1691ef4f35d8SEric Saxe /* 1692ef4f35d8SEric Saxe * CMT lineage validation 1693ef4f35d8SEric Saxe * 1694ef4f35d8SEric Saxe * This routine is invoked by pg_cmt_cpu_init() to validate the integrity 1695ef4f35d8SEric Saxe * of the PGs in a CPU's lineage. This is necessary because it's possible that 1696ef4f35d8SEric Saxe * some groupings (power domain groupings in particular) may be defined by 1697ef4f35d8SEric Saxe * sources that are buggy (e.g. BIOS bugs). In such cases, it may not be 1698ef4f35d8SEric Saxe * possible to integrate those groupings into the CMT PG hierarchy, if doing 1699ef4f35d8SEric Saxe * so would violate the subset invariant of the hierarchy, which says that 1700ef4f35d8SEric Saxe * a PG must be subset of its parent (if it has one). 1701ef4f35d8SEric Saxe * 1702ef4f35d8SEric Saxe * pg_cmt_lineage_validate()'s purpose is to detect grouping definitions that 1703ef4f35d8SEric Saxe * would result in a violation of this invariant. If a violation is found, 1704ef4f35d8SEric Saxe * and the PG is of a grouping type who's definition is known to originate from 1705ef4f35d8SEric Saxe * suspect sources (BIOS), then pg_cmt_prune() will be invoked to prune the 1706b025faeeSEric Saxe * PG (and all other instances PG's sharing relationship type) from the CMT 1707ef4f35d8SEric Saxe * hierarchy. Further, future instances of that sharing relationship type won't 1708b025faeeSEric Saxe * be added. If the grouping definition doesn't originate from suspect 1709ef4f35d8SEric Saxe * sources, then pg_cmt_disable() will be invoked to log an error, and disable 1710ef4f35d8SEric Saxe * CMT scheduling altogether. 1711ef4f35d8SEric Saxe * 1712ef4f35d8SEric Saxe * This routine is invoked after the CPU has been added to the PGs in which 1713ef4f35d8SEric Saxe * it belongs, but before those PGs have been added to (or had their place 1714ef4f35d8SEric Saxe * adjusted in) the CMT PG hierarchy. 1715ef4f35d8SEric Saxe * 1716ef4f35d8SEric Saxe * The first argument is the CPUs PG lineage (essentially an array of PGs in 1717ef4f35d8SEric Saxe * which the CPU belongs) that has already been sorted in ascending order 1718ef4f35d8SEric Saxe * by CPU count. Some of the PGs in the CPUs lineage may already have other 1719ef4f35d8SEric Saxe * CPUs in them, and have already been integrated into the CMT hierarchy. 1720ef4f35d8SEric Saxe * 1721ef4f35d8SEric Saxe * The addition of this new CPU to these pre-existing PGs means that those 1722ef4f35d8SEric Saxe * PGs may need to be promoted up in the hierarchy to satisfy the subset 1723ef4f35d8SEric Saxe * invariant. In additon to testing the subset invariant for the lineage, 1724ef4f35d8SEric Saxe * this routine also verifies that the addition of the new CPU to the 1725ef4f35d8SEric Saxe * existing PGs wouldn't cause the subset invariant to be violated in 1726ef4f35d8SEric Saxe * the exiting lineages. 1727ef4f35d8SEric Saxe * 1728ef4f35d8SEric Saxe * This routine will normally return one of the following: 1729ef4f35d8SEric Saxe * CMT_LINEAGE_VALID - There were no problems detected with the lineage. 1730ef4f35d8SEric Saxe * CMT_LINEAGE_REPAIRED - Problems were detected, but repaired via pruning. 1731ef4f35d8SEric Saxe * 1732ef4f35d8SEric Saxe * Otherwise, this routine will return a value indicating which error it 1733ef4f35d8SEric Saxe * was unable to recover from (and set cmt_lineage_status along the way). 17341a77c24bSEric Saxe * 17351a77c24bSEric Saxe * This routine operates on the CPU specific processor group data (for the CPU 17361a77c24bSEric Saxe * whose lineage is being validated), which is under-construction. 17371a77c24bSEric Saxe * "pgdata" is a reference to the CPU's under-construction PG data. 17381a77c24bSEric Saxe * This routine must be careful to operate only on "pgdata", and not cp->cpu_pg. 1739ef4f35d8SEric Saxe */ 1740ef4f35d8SEric Saxe static cmt_lineage_validation_t 17411a77c24bSEric Saxe pg_cmt_lineage_validate(pg_cmt_t **lineage, int *sz, cpu_pg_t *pgdata) 17420e751525SEric Saxe { 1743ef4f35d8SEric Saxe int i, j, size; 1744b025faeeSEric Saxe pg_cmt_t *pg, *pg_next, *pg_bad, *pg_tmp, *parent; 17450e751525SEric Saxe cpu_t *cp; 17460e751525SEric Saxe pg_cpu_itr_t cpu_iter; 1747ef4f35d8SEric Saxe lgrp_handle_t lgrp; 17480e751525SEric Saxe 17490e751525SEric Saxe ASSERT(MUTEX_HELD(&cpu_lock)); 17500e751525SEric Saxe 17510e751525SEric Saxe revalidate: 17520e751525SEric Saxe size = *sz; 17530e751525SEric Saxe pg_bad = NULL; 1754ef4f35d8SEric Saxe lgrp = LGRP_NULL_HANDLE; 1755ef4f35d8SEric Saxe for (i = 0; i < size; i++) { 17560e751525SEric Saxe 17570e751525SEric Saxe pg = lineage[i]; 1758ef4f35d8SEric Saxe if (i < size - 1) 1759ef4f35d8SEric Saxe pg_next = lineage[i + 1]; 1760ef4f35d8SEric Saxe else 1761ef4f35d8SEric Saxe pg_next = NULL; 17626890d023SEric Saxe 17636890d023SEric Saxe /* 17640e751525SEric Saxe * We assume that the lineage has already been sorted 17650e751525SEric Saxe * by the number of CPUs. In fact, we depend on it. 17666890d023SEric Saxe */ 1767ef4f35d8SEric Saxe ASSERT(pg_next == NULL || 1768ef4f35d8SEric Saxe (PG_NUM_CPUS((pg_t *)pg) <= PG_NUM_CPUS((pg_t *)pg_next))); 17696890d023SEric Saxe 17706890d023SEric Saxe /* 1771b025faeeSEric Saxe * The CPUs PG lineage was passed as the first argument to 1772b025faeeSEric Saxe * this routine and contains the sorted list of the CPU's 1773b025faeeSEric Saxe * PGs. Ultimately, the ordering of the PGs in that list, and 1774b025faeeSEric Saxe * the ordering as traversed by the cmt_parent list must be 1775b025faeeSEric Saxe * the same. PG promotion will be used as the mechanism to 1776b025faeeSEric Saxe * achieve this, but first we need to look for cases where 1777b025faeeSEric Saxe * promotion will be necessary, and validate that will be 1778b025faeeSEric Saxe * possible without violating the subset invarient described 1779b025faeeSEric Saxe * above. 1780ef4f35d8SEric Saxe * 1781ef4f35d8SEric Saxe * Since the PG topology is in the middle of being changed, we 1782ef4f35d8SEric Saxe * need to check whether the PG's existing parent (if any) is 1783b025faeeSEric Saxe * part of this CPU's lineage (and therefore should contain 1784b025faeeSEric Saxe * the new CPU). If not, it means that the addition of the 1785b025faeeSEric Saxe * new CPU should have made this PG have more CPUs than its 1786b025faeeSEric Saxe * parent (and other ancestors not in the same lineage) and 1787b025faeeSEric Saxe * will need to be promoted into place. 1788b025faeeSEric Saxe * 1789b025faeeSEric Saxe * We need to verify all of this to defend against a buggy 1790ef4f35d8SEric Saxe * BIOS giving bad power domain CPU groupings. Sigh. 1791ef4f35d8SEric Saxe */ 1792b025faeeSEric Saxe parent = pg->cmt_parent; 1793b025faeeSEric Saxe while (parent != NULL) { 1794ef4f35d8SEric Saxe /* 1795b025faeeSEric Saxe * Determine if the parent/ancestor is in this lineage 1796ef4f35d8SEric Saxe */ 1797b025faeeSEric Saxe pg_tmp = NULL; 1798b025faeeSEric Saxe for (j = 0; (j < size) && (pg_tmp != parent); j++) { 1799ef4f35d8SEric Saxe pg_tmp = lineage[j]; 1800b025faeeSEric Saxe } 1801b025faeeSEric Saxe if (pg_tmp == parent) { 1802b025faeeSEric Saxe /* 1803b025faeeSEric Saxe * It's in the lineage. The concentricity 1804b025faeeSEric Saxe * checks will handle the rest. 1805b025faeeSEric Saxe */ 1806ef4f35d8SEric Saxe break; 1807ef4f35d8SEric Saxe } 1808ef4f35d8SEric Saxe /* 1809b025faeeSEric Saxe * If it is not in the lineage, PG will eventually 1810b025faeeSEric Saxe * need to be promoted above it. Verify the ancestor 1811b025faeeSEric Saxe * is a proper subset. There is still an error if 1812b025faeeSEric Saxe * the ancestor has the same number of CPUs as PG, 1813b025faeeSEric Saxe * since that would imply it should be in the lineage, 1814b025faeeSEric Saxe * and we already know it isn't. 1815ef4f35d8SEric Saxe */ 1816b025faeeSEric Saxe if (PG_NUM_CPUS((pg_t *)parent) >= 1817ef4f35d8SEric Saxe PG_NUM_CPUS((pg_t *)pg)) { 1818ef4f35d8SEric Saxe /* 1819b025faeeSEric Saxe * Not a proper subset if the parent/ancestor 1820b025faeeSEric Saxe * has the same or more CPUs than PG. 1821ef4f35d8SEric Saxe */ 1822b025faeeSEric Saxe cmt_lineage_status = CMT_LINEAGE_NON_PROMOTABLE; 1823ef4f35d8SEric Saxe goto handle_error; 1824ef4f35d8SEric Saxe } 1825b025faeeSEric Saxe parent = parent->cmt_parent; 1826ef4f35d8SEric Saxe } 1827ef4f35d8SEric Saxe 1828ef4f35d8SEric Saxe /* 1829ef4f35d8SEric Saxe * Walk each of the CPUs in the PGs group and perform 1830ef4f35d8SEric Saxe * consistency checks along the way. 18316890d023SEric Saxe */ 18320e751525SEric Saxe PG_CPU_ITR_INIT((pg_t *)pg, cpu_iter); 18330e751525SEric Saxe while ((cp = pg_cpu_next(&cpu_iter)) != NULL) { 1834ef4f35d8SEric Saxe /* 1835ef4f35d8SEric Saxe * Verify that there aren't any CPUs contained in PG 1836ef4f35d8SEric Saxe * that the next PG in the lineage (which is larger 1837ef4f35d8SEric Saxe * or same size) doesn't also contain. 1838ef4f35d8SEric Saxe */ 1839ef4f35d8SEric Saxe if (pg_next != NULL && 1840ef4f35d8SEric Saxe pg_cpu_find((pg_t *)pg_next, cp) == B_FALSE) { 18410e751525SEric Saxe cmt_lineage_status = CMT_LINEAGE_NON_CONCENTRIC; 18420e751525SEric Saxe goto handle_error; 18436890d023SEric Saxe } 1844ef4f35d8SEric Saxe 1845ef4f35d8SEric Saxe /* 1846ef4f35d8SEric Saxe * Verify that all the CPUs in the PG are in the same 1847ef4f35d8SEric Saxe * lgroup. 1848ef4f35d8SEric Saxe */ 1849ef4f35d8SEric Saxe if (lgrp == LGRP_NULL_HANDLE) { 1850ef4f35d8SEric Saxe lgrp = lgrp_plat_cpu_to_hand(cp->cpu_id); 1851ef4f35d8SEric Saxe } else if (lgrp_plat_cpu_to_hand(cp->cpu_id) != lgrp) { 1852ef4f35d8SEric Saxe cmt_lineage_status = CMT_LINEAGE_PG_SPANS_LGRPS; 1853ef4f35d8SEric Saxe goto handle_error; 1854ef4f35d8SEric Saxe } 18550e751525SEric Saxe } 18566890d023SEric Saxe } 18576890d023SEric Saxe 18580e751525SEric Saxe handle_error: 1859ef4f35d8SEric Saxe /* 1860ef4f35d8SEric Saxe * Some of these validation errors can result when the CPU grouping 1861ef4f35d8SEric Saxe * information is derived from buggy sources (for example, incorrect 1862ef4f35d8SEric Saxe * ACPI tables on x86 systems). 1863ef4f35d8SEric Saxe * 1864ef4f35d8SEric Saxe * We'll try to recover in such cases by pruning out the illegal 1865ef4f35d8SEric Saxe * groupings from the PG hierarchy, which means that we won't optimize 1866ef4f35d8SEric Saxe * for those levels, but we will for the remaining ones. 1867ef4f35d8SEric Saxe */ 18680e751525SEric Saxe switch (cmt_lineage_status) { 18690e751525SEric Saxe case CMT_LINEAGE_VALID: 18700e751525SEric Saxe case CMT_LINEAGE_REPAIRED: 18710e751525SEric Saxe break; 1872ef4f35d8SEric Saxe case CMT_LINEAGE_PG_SPANS_LGRPS: 1873ef4f35d8SEric Saxe /* 1874ef4f35d8SEric Saxe * We've detected a PG whose CPUs span lgroups. 1875ef4f35d8SEric Saxe * 1876ef4f35d8SEric Saxe * This isn't supported, as the dispatcher isn't allowed to 1877ef4f35d8SEric Saxe * to do CMT thread placement across lgroups, as this would 1878ef4f35d8SEric Saxe * conflict with policies implementing MPO thread affinity. 1879ef4f35d8SEric Saxe * 1880d0e93b69SEric Saxe * If the PG is of a sharing relationship type known to 1881d0e93b69SEric Saxe * legitimately span lgroups, specify that no CMT thread 1882d0e93b69SEric Saxe * placement policy should be implemented, and prune the PG 1883d0e93b69SEric Saxe * from the existing CMT PG hierarchy. 1884d0e93b69SEric Saxe * 1885d0e93b69SEric Saxe * Otherwise, fall though to the case below for handling. 1886ef4f35d8SEric Saxe */ 1887d0e93b69SEric Saxe if (((pghw_t *)pg)->pghw_hw == PGHW_CHIP) { 1888d0e93b69SEric Saxe if (pg_cmt_prune(pg, lineage, sz, pgdata) == 0) { 1889d0e93b69SEric Saxe cmt_lineage_status = CMT_LINEAGE_REPAIRED; 1890d0e93b69SEric Saxe goto revalidate; 1891d0e93b69SEric Saxe } 1892d0e93b69SEric Saxe } 1893d0e93b69SEric Saxe /*LINTED*/ 1894ef4f35d8SEric Saxe case CMT_LINEAGE_NON_PROMOTABLE: 1895ef4f35d8SEric Saxe /* 1896ef4f35d8SEric Saxe * We've detected a PG that already exists in another CPU's 1897ef4f35d8SEric Saxe * lineage that cannot cannot legally be promoted into place 1898ef4f35d8SEric Saxe * without breaking the invariants of the hierarchy. 1899ef4f35d8SEric Saxe */ 1900ef4f35d8SEric Saxe if (PG_CMT_HW_SUSPECT(((pghw_t *)pg)->pghw_hw)) { 19011a77c24bSEric Saxe if (pg_cmt_prune(pg, lineage, sz, pgdata) == 0) { 1902ef4f35d8SEric Saxe cmt_lineage_status = CMT_LINEAGE_REPAIRED; 1903ef4f35d8SEric Saxe goto revalidate; 1904ef4f35d8SEric Saxe } 1905ef4f35d8SEric Saxe } 1906ef4f35d8SEric Saxe /* 1907ef4f35d8SEric Saxe * Something went wrong trying to prune out the bad level. 1908ef4f35d8SEric Saxe * Disable CMT scheduling altogether. 1909ef4f35d8SEric Saxe */ 1910ef4f35d8SEric Saxe pg_cmt_disable(); 1911ef4f35d8SEric Saxe break; 19120e751525SEric Saxe case CMT_LINEAGE_NON_CONCENTRIC: 19136890d023SEric Saxe /* 1914ef4f35d8SEric Saxe * We've detected a non-concentric PG lineage, which means that 1915ef4f35d8SEric Saxe * there's a PG in the lineage that has CPUs that the next PG 1916ef4f35d8SEric Saxe * over in the lineage (which is the same size or larger) 1917ef4f35d8SEric Saxe * doesn't have. 19180e751525SEric Saxe * 1919ef4f35d8SEric Saxe * In this case, we examine the two PGs to see if either 1920ef4f35d8SEric Saxe * grouping is defined by potentially buggy sources. 19210e751525SEric Saxe * 19220e751525SEric Saxe * If one has less CPUs than the other, and contains CPUs 19230e751525SEric Saxe * not found in the parent, and it is an untrusted enumeration, 19240e751525SEric Saxe * then prune it. If both have the same number of CPUs, then 19250e751525SEric Saxe * prune the one that is untrusted. 19260e751525SEric Saxe * 19270e751525SEric Saxe * This process repeats until we have a concentric lineage, 19280e751525SEric Saxe * or we would have to prune out level derived from what we 19290e751525SEric Saxe * thought was a reliable source, in which case CMT scheduling 1930ef4f35d8SEric Saxe * is disabled altogether. 19316890d023SEric Saxe */ 1932ef4f35d8SEric Saxe if ((PG_NUM_CPUS((pg_t *)pg) < PG_NUM_CPUS((pg_t *)pg_next)) && 19330e751525SEric Saxe (PG_CMT_HW_SUSPECT(((pghw_t *)pg)->pghw_hw))) { 19340e751525SEric Saxe pg_bad = pg; 19350e751525SEric Saxe } else if (PG_NUM_CPUS((pg_t *)pg) == 1936ef4f35d8SEric Saxe PG_NUM_CPUS((pg_t *)pg_next)) { 1937ef4f35d8SEric Saxe if (PG_CMT_HW_SUSPECT(((pghw_t *)pg_next)->pghw_hw)) { 1938ef4f35d8SEric Saxe pg_bad = pg_next; 19390e751525SEric Saxe } else if (PG_CMT_HW_SUSPECT(((pghw_t *)pg)->pghw_hw)) { 19400e751525SEric Saxe pg_bad = pg; 19416890d023SEric Saxe } 19426890d023SEric Saxe } 19430e751525SEric Saxe if (pg_bad) { 19441a77c24bSEric Saxe if (pg_cmt_prune(pg_bad, lineage, sz, pgdata) == 0) { 19450e751525SEric Saxe cmt_lineage_status = CMT_LINEAGE_REPAIRED; 19460e751525SEric Saxe goto revalidate; 19470e751525SEric Saxe } 19480e751525SEric Saxe } 19490e751525SEric Saxe /* 1950ef4f35d8SEric Saxe * Something went wrong trying to identify and/or prune out 1951ef4f35d8SEric Saxe * the bad level. Disable CMT scheduling altogether. 19520e751525SEric Saxe */ 19530e751525SEric Saxe pg_cmt_disable(); 1954ef4f35d8SEric Saxe break; 1955ef4f35d8SEric Saxe default: 1956ef4f35d8SEric Saxe /* 1957ef4f35d8SEric Saxe * If we're here, we've encountered a validation error for 1958ef4f35d8SEric Saxe * which we don't know how to recover. In this case, disable 1959ef4f35d8SEric Saxe * CMT scheduling altogether. 1960ef4f35d8SEric Saxe */ 19610e751525SEric Saxe cmt_lineage_status = CMT_LINEAGE_UNRECOVERABLE; 1962ef4f35d8SEric Saxe pg_cmt_disable(); 19630e751525SEric Saxe } 1964ef4f35d8SEric Saxe return (cmt_lineage_status); 19656890d023SEric Saxe } 1966