1fb2f18f8Sesaxe /* 2fb2f18f8Sesaxe * CDDL HEADER START 3fb2f18f8Sesaxe * 4fb2f18f8Sesaxe * The contents of this file are subject to the terms of the 5fb2f18f8Sesaxe * Common Development and Distribution License (the "License"). 6fb2f18f8Sesaxe * You may not use this file except in compliance with the License. 7fb2f18f8Sesaxe * 8fb2f18f8Sesaxe * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9fb2f18f8Sesaxe * or http://www.opensolaris.org/os/licensing. 10fb2f18f8Sesaxe * See the License for the specific language governing permissions 11fb2f18f8Sesaxe * and limitations under the License. 12fb2f18f8Sesaxe * 13fb2f18f8Sesaxe * When distributing Covered Code, include this CDDL HEADER in each 14fb2f18f8Sesaxe * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15fb2f18f8Sesaxe * If applicable, add the following below this CDDL HEADER, with the 16fb2f18f8Sesaxe * fields enclosed by brackets "[]" replaced with your own identifying 17fb2f18f8Sesaxe * information: Portions Copyright [yyyy] [name of copyright owner] 18fb2f18f8Sesaxe * 19fb2f18f8Sesaxe * CDDL HEADER END 20fb2f18f8Sesaxe */ 21fb2f18f8Sesaxe /* 22c416da2dSjb145095 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23fb2f18f8Sesaxe * Use is subject to license terms. 
24fb2f18f8Sesaxe */ 25fb2f18f8Sesaxe 26fb2f18f8Sesaxe #include <sys/systm.h> 27fb2f18f8Sesaxe #include <sys/types.h> 28fb2f18f8Sesaxe #include <sys/param.h> 29fb2f18f8Sesaxe #include <sys/thread.h> 30fb2f18f8Sesaxe #include <sys/cpuvar.h> 31fb2f18f8Sesaxe #include <sys/cpupart.h> 32fb2f18f8Sesaxe #include <sys/kmem.h> 33fb2f18f8Sesaxe #include <sys/cmn_err.h> 34fb2f18f8Sesaxe #include <sys/kstat.h> 35fb2f18f8Sesaxe #include <sys/processor.h> 36fb2f18f8Sesaxe #include <sys/disp.h> 37fb2f18f8Sesaxe #include <sys/group.h> 38fb2f18f8Sesaxe #include <sys/pghw.h> 39fb2f18f8Sesaxe #include <sys/bitset.h> 40fb2f18f8Sesaxe #include <sys/lgrp.h> 41fb2f18f8Sesaxe #include <sys/cmt.h> 42fb2f18f8Sesaxe 43fb2f18f8Sesaxe /* 44fb2f18f8Sesaxe * CMT scheduler / dispatcher support 45fb2f18f8Sesaxe * 46fb2f18f8Sesaxe * This file implements CMT scheduler support using Processor Groups. 47fb2f18f8Sesaxe * The CMT processor group class creates and maintains the CMT class 48fb2f18f8Sesaxe * specific processor group pg_cmt_t. 49fb2f18f8Sesaxe * 50fb2f18f8Sesaxe * ---------------------------- <-- pg_cmt_t * 51fb2f18f8Sesaxe * | pghw_t | 52fb2f18f8Sesaxe * ---------------------------- 53fb2f18f8Sesaxe * | CMT class specific data | 54fb2f18f8Sesaxe * | - hierarchy linkage | 55fb2f18f8Sesaxe * | - CMT load balancing data| 56fb2f18f8Sesaxe * | - active CPU group/bitset| 57fb2f18f8Sesaxe * ---------------------------- 58fb2f18f8Sesaxe * 59fb2f18f8Sesaxe * The scheduler/dispatcher leverages knowledge of the performance 60fb2f18f8Sesaxe * relevant CMT sharing relationships existing between cpus to implement 61fb2f18f8Sesaxe * optimized affinity and load balancing policies. 62fb2f18f8Sesaxe * 63fb2f18f8Sesaxe * Load balancing policy seeks to improve performance by minimizing 64fb2f18f8Sesaxe * contention over shared processor resources / facilities, while the 65fb2f18f8Sesaxe * affinity policies seek to improve cache and TLB utilization. 
 *
 * The CMT PGs created by this class are already arranged into a
 * hierarchy (which is done in the pghw layer). To implement the top-down
 * CMT load balancing algorithm, the CMT PGs additionally maintain
 * parent, child and sibling hierarchy relationships.
 * Parent PGs always contain a superset of their children's resources,
 * each PG can have at most one parent, and siblings are the group of PGs
 * sharing the same parent.
 *
 * On NUMA systems, the CMT load balancing algorithm balances across the
 * CMT PGs within their respective lgroups. On UMA based systems, there
 * exists a top level group of PGs to balance across. On NUMA systems multiple
 * top level groups are instantiated, where the top level balancing begins by
 * balancing across the CMT PGs within their respective (per lgroup) top level
 * groups.
81fb2f18f8Sesaxe */ 82fb2f18f8Sesaxe typedef struct cmt_lgrp { 83fb2f18f8Sesaxe group_t cl_pgs; /* Top level group of active CMT PGs */ 84fb2f18f8Sesaxe int cl_npgs; /* # of top level PGs in the lgroup */ 85fb2f18f8Sesaxe lgrp_handle_t cl_hand; /* lgroup's platform handle */ 86fb2f18f8Sesaxe struct cmt_lgrp *cl_next; /* next cmt_lgrp */ 87fb2f18f8Sesaxe } cmt_lgrp_t; 88fb2f18f8Sesaxe 89a6604450Sesaxe static cmt_lgrp_t *cmt_lgrps = NULL; /* cmt_lgrps list head */ 90a6604450Sesaxe static cmt_lgrp_t *cpu0_lgrp = NULL; /* boot CPU's initial lgrp */ 91a6604450Sesaxe /* used for null_proc_lpa */ 92*6890d023SEric Saxe static cmt_lgrp_t *cmt_root = NULL; /* Reference to root cmt pg */ 93fb2f18f8Sesaxe 94a6604450Sesaxe static int is_cpu0 = 1; /* true if this is boot CPU context */ 95a6604450Sesaxe 96a6604450Sesaxe /* 97a6604450Sesaxe * Set this to non-zero to disable CMT scheduling 98a6604450Sesaxe * This must be done via kmdb -d, as /etc/system will be too late 99a6604450Sesaxe */ 100fb2f18f8Sesaxe static int cmt_sched_disabled = 0; 101fb2f18f8Sesaxe 102fb2f18f8Sesaxe static pg_cid_t pg_cmt_class_id; /* PG class id */ 103fb2f18f8Sesaxe 104fb2f18f8Sesaxe static pg_t *pg_cmt_alloc(); 105fb2f18f8Sesaxe static void pg_cmt_free(pg_t *); 106fb2f18f8Sesaxe static void pg_cmt_cpu_init(cpu_t *); 107fb2f18f8Sesaxe static void pg_cmt_cpu_fini(cpu_t *); 108fb2f18f8Sesaxe static void pg_cmt_cpu_active(cpu_t *); 109fb2f18f8Sesaxe static void pg_cmt_cpu_inactive(cpu_t *); 110fb2f18f8Sesaxe static void pg_cmt_cpupart_in(cpu_t *, cpupart_t *); 111fb2f18f8Sesaxe static void pg_cmt_cpupart_move(cpu_t *, cpupart_t *, cpupart_t *); 112*6890d023SEric Saxe static void pg_cmt_hier_pack(void **, int); 113fb2f18f8Sesaxe static int pg_cmt_cpu_belongs(pg_t *, cpu_t *); 114fb2f18f8Sesaxe static int pg_cmt_hw(pghw_type_t); 115fb2f18f8Sesaxe static cmt_lgrp_t *pg_cmt_find_lgrp(lgrp_handle_t); 116a6604450Sesaxe static cmt_lgrp_t *pg_cmt_lgrp_create(lgrp_handle_t); 117fb2f18f8Sesaxe 118fb2f18f8Sesaxe /* 
119fb2f18f8Sesaxe * Macro to test if PG is managed by the CMT PG class 120fb2f18f8Sesaxe */ 121fb2f18f8Sesaxe #define IS_CMT_PG(pg) (((pg_t *)(pg))->pg_class->pgc_id == pg_cmt_class_id) 122fb2f18f8Sesaxe 123fb2f18f8Sesaxe /* 124fb2f18f8Sesaxe * CMT PG ops 125fb2f18f8Sesaxe */ 126fb2f18f8Sesaxe struct pg_ops pg_ops_cmt = { 127fb2f18f8Sesaxe pg_cmt_alloc, 128fb2f18f8Sesaxe pg_cmt_free, 129fb2f18f8Sesaxe pg_cmt_cpu_init, 130fb2f18f8Sesaxe pg_cmt_cpu_fini, 131fb2f18f8Sesaxe pg_cmt_cpu_active, 132fb2f18f8Sesaxe pg_cmt_cpu_inactive, 133fb2f18f8Sesaxe pg_cmt_cpupart_in, 134fb2f18f8Sesaxe NULL, /* cpupart_out */ 135fb2f18f8Sesaxe pg_cmt_cpupart_move, 136fb2f18f8Sesaxe pg_cmt_cpu_belongs, 137fb2f18f8Sesaxe }; 138fb2f18f8Sesaxe 139fb2f18f8Sesaxe /* 140fb2f18f8Sesaxe * Initialize the CMT PG class 141fb2f18f8Sesaxe */ 142fb2f18f8Sesaxe void 143fb2f18f8Sesaxe pg_cmt_class_init(void) 144fb2f18f8Sesaxe { 145fb2f18f8Sesaxe if (cmt_sched_disabled) 146fb2f18f8Sesaxe return; 147fb2f18f8Sesaxe 148fb2f18f8Sesaxe pg_cmt_class_id = pg_class_register("cmt", &pg_ops_cmt, PGR_PHYSICAL); 149fb2f18f8Sesaxe } 150fb2f18f8Sesaxe 151fb2f18f8Sesaxe /* 152fb2f18f8Sesaxe * Called to indicate a new CPU has started up so 153fb2f18f8Sesaxe * that either t0 or the slave startup thread can 154fb2f18f8Sesaxe * be accounted for. 155fb2f18f8Sesaxe */ 156fb2f18f8Sesaxe void 157fb2f18f8Sesaxe pg_cmt_cpu_startup(cpu_t *cp) 158fb2f18f8Sesaxe { 159fb2f18f8Sesaxe PG_NRUN_UPDATE(cp, 1); 160fb2f18f8Sesaxe } 161fb2f18f8Sesaxe 162fb2f18f8Sesaxe /* 163fb2f18f8Sesaxe * Adjust the CMT load in the CMT PGs in which the CPU belongs 164fb2f18f8Sesaxe * Note that "n" can be positive in the case of increasing 165fb2f18f8Sesaxe * load, or negative in the case of decreasing load. 
166fb2f18f8Sesaxe */ 167fb2f18f8Sesaxe void 168fb2f18f8Sesaxe pg_cmt_load(cpu_t *cp, int n) 169fb2f18f8Sesaxe { 170fb2f18f8Sesaxe pg_cmt_t *pg; 171fb2f18f8Sesaxe 172fb2f18f8Sesaxe pg = (pg_cmt_t *)cp->cpu_pg->cmt_lineage; 173fb2f18f8Sesaxe while (pg != NULL) { 174fb2f18f8Sesaxe ASSERT(IS_CMT_PG(pg)); 175fb2f18f8Sesaxe atomic_add_32(&pg->cmt_nrunning, n); 176fb2f18f8Sesaxe pg = pg->cmt_parent; 177fb2f18f8Sesaxe } 178fb2f18f8Sesaxe } 179fb2f18f8Sesaxe 180fb2f18f8Sesaxe /* 181fb2f18f8Sesaxe * Return non-zero if thread can migrate between "from" and "to" 182fb2f18f8Sesaxe * without a performance penalty 183fb2f18f8Sesaxe */ 184fb2f18f8Sesaxe int 185fb2f18f8Sesaxe pg_cmt_can_migrate(cpu_t *from, cpu_t *to) 186fb2f18f8Sesaxe { 187fb2f18f8Sesaxe if (from->cpu_physid->cpu_cacheid == 188fb2f18f8Sesaxe to->cpu_physid->cpu_cacheid) 189fb2f18f8Sesaxe return (1); 190fb2f18f8Sesaxe return (0); 191fb2f18f8Sesaxe } 192fb2f18f8Sesaxe 193fb2f18f8Sesaxe /* 194fb2f18f8Sesaxe * CMT class specific PG allocation 195fb2f18f8Sesaxe */ 196fb2f18f8Sesaxe static pg_t * 197fb2f18f8Sesaxe pg_cmt_alloc(void) 198fb2f18f8Sesaxe { 199fb2f18f8Sesaxe return (kmem_zalloc(sizeof (pg_cmt_t), KM_NOSLEEP)); 200fb2f18f8Sesaxe } 201fb2f18f8Sesaxe 202fb2f18f8Sesaxe /* 203fb2f18f8Sesaxe * Class specific PG de-allocation 204fb2f18f8Sesaxe */ 205fb2f18f8Sesaxe static void 206fb2f18f8Sesaxe pg_cmt_free(pg_t *pg) 207fb2f18f8Sesaxe { 208fb2f18f8Sesaxe ASSERT(pg != NULL); 209fb2f18f8Sesaxe ASSERT(IS_CMT_PG(pg)); 210fb2f18f8Sesaxe 211fb2f18f8Sesaxe kmem_free((pg_cmt_t *)pg, sizeof (pg_cmt_t)); 212fb2f18f8Sesaxe } 213fb2f18f8Sesaxe 214fb2f18f8Sesaxe /* 215fb2f18f8Sesaxe * Return 1 if CMT scheduling policies should be impelmented 216fb2f18f8Sesaxe * for the specified hardware sharing relationship. 
 */
static int
pg_cmt_hw(pghw_type_t hw)
{
	return (pg_plat_cmt_load_bal_hw(hw) ||
	    pg_plat_cmt_affinity_hw(hw));
}

/*
 * CMT class callback for a new CPU entering the system.
 *
 * Places the CPU in (or creates) the CMT PG for each of its CMT
 * hardware sharing relationships, and builds the CPU's CMT PG
 * lineage used for top-down load balancing.
 * Caller must hold cpu_lock.
 */
static void
pg_cmt_cpu_init(cpu_t *cp)
{
	pg_cmt_t	*pg;
	group_t		*cmt_pgs;
	int		level, max_level, nlevels;
	pghw_type_t	hw;
	pg_t		*pg_cache = NULL;
	pg_cmt_t	*cpu_cmt_hier[PGHW_NUM_COMPONENTS];
	lgrp_handle_t	lgrp_handle;
	cmt_lgrp_t	*lgrp;

	ASSERT(MUTEX_HELD(&cpu_lock));

	/*
	 * A new CPU is coming into the system.
	 * Interrogate the platform to see if the CPU
	 * has any performance relevant CMT sharing
	 * relationships
	 */
	cmt_pgs = &cp->cpu_pg->cmt_pgs;
	cp->cpu_pg->cmt_lineage = NULL;

	bzero(cpu_cmt_hier, sizeof (cpu_cmt_hier));
	max_level = nlevels = 0;
	for (hw = PGHW_START; hw < PGHW_NUM_COMPONENTS; hw++) {

		/*
		 * We're only interested in CMT hw sharing relationships
		 */
		if (pg_cmt_hw(hw) == 0 || pg_plat_hw_shared(cp, hw) == 0)
			continue;

		/*
		 * Find (or create) the PG associated with
		 * the hw sharing relationship in which cp
		 * belongs.
		 *
		 * Determine if a suitable PG already
		 * exists, or if one needs to be created.
		 */
		pg = (pg_cmt_t *)pghw_place_cpu(cp, hw);
		if (pg == NULL) {
			/*
			 * Create a new one.
			 * Initialize the common...
			 */
			pg = (pg_cmt_t *)pg_create(pg_cmt_class_id);

			/* ... physical ... */
			pghw_init((pghw_t *)pg, cp, hw);

			/*
			 * ... and CMT specific portions of the
			 * structure.
			 */
			bitset_init(&pg->cmt_cpus_actv_set);
			group_create(&pg->cmt_cpus_actv);
		} else {
			ASSERT(IS_CMT_PG(pg));
		}

		/* Add the CPU to the PG */
		pg_cpu_add((pg_t *)pg, cp);

		/*
		 * Ensure capacity of the active CPU group/bitset
		 */
		group_expand(&pg->cmt_cpus_actv,
		    GROUP_SIZE(&((pg_t *)pg)->pg_cpus));

		if (cp->cpu_seqid >=
		    bitset_capacity(&pg->cmt_cpus_actv_set)) {
			bitset_resize(&pg->cmt_cpus_actv_set,
			    cp->cpu_seqid + 1);
		}

		/*
		 * Build a lineage of CMT PGs for load balancing
		 */
		if (pg_plat_cmt_load_bal_hw(hw)) {
			level = pghw_level(hw);
			cpu_cmt_hier[level] = pg;
			if (level > max_level)
				max_level = level;
			nlevels++;
		}

		/* Cache this for later */
		if (hw == PGHW_CACHE)
			pg_cache = (pg_t *)pg;
	}

	/*
	 * Pack out any gaps in the constructed lineage,
	 * then size it out.
	 *
	 * Gaps may exist where the architecture knows
	 * about a hardware sharing relationship, but such a
	 * relationship either isn't relevant for load
	 * balancing or doesn't exist between CPUs on the system.
	 */
	pg_cmt_hier_pack((void **)cpu_cmt_hier, max_level + 1);
	group_expand(cmt_pgs, nlevels);

	/* Lazily create the root lgroup's top level PG group */
	if (cmt_root == NULL)
		cmt_root = pg_cmt_lgrp_create(lgrp_plat_root_hand());

	/*
	 * Find the lgrp that encapsulates this CPU's CMT hierarchy,
	 * and locate/create a suitable cmt_lgrp_t.
	 */
	lgrp_handle = lgrp_plat_cpu_to_hand(cp->cpu_id);
	if ((lgrp = pg_cmt_find_lgrp(lgrp_handle)) == NULL)
		lgrp = pg_cmt_lgrp_create(lgrp_handle);

	/*
	 * For each of the PGs in the CPU's lineage:
	 *	- Add an entry in the CPU's CMT PG group
	 *	which is used by the dispatcher to implement load balancing
	 *	policy.
	 *	- Tie the PG into the CMT hierarchy by connecting
	 *	it to it's parent and siblings.
	 */
	for (level = 0; level < nlevels; level++) {
		uint_t		children;
		int		err;

		pg = cpu_cmt_hier[level];
		/* Lineage is kept ordered top (root) down (leaf) */
		err = group_add_at(cmt_pgs, pg, nlevels - level - 1);
		ASSERT(err == 0);

		if (level == 0)
			cp->cpu_pg->cmt_lineage = (pg_t *)pg;

		if (pg->cmt_siblings != NULL) {
			/* Already initialized */
			ASSERT(pg->cmt_parent == NULL ||
			    pg->cmt_parent == cpu_cmt_hier[level + 1]);
			ASSERT(pg->cmt_siblings == &lgrp->cl_pgs ||
			    ((pg->cmt_parent != NULL) &&
			    pg->cmt_siblings == pg->cmt_parent->cmt_children));
			continue;
		}

		if ((level + 1) == nlevels) {
			/* Top level PG: siblings are the lgroup's PGs */
			pg->cmt_parent = NULL;

			pg->cmt_siblings = &lgrp->cl_pgs;
			children = ++lgrp->cl_npgs;
			cmt_root->cl_npgs++;
		} else {
			pg->cmt_parent = cpu_cmt_hier[level + 1];

			/*
			 * A good parent keeps track of their children.
			 * The parent's children group is also the PG's
			 * siblings.
			 */
			if (pg->cmt_parent->cmt_children == NULL) {
				pg->cmt_parent->cmt_children =
				    kmem_zalloc(sizeof (group_t), KM_SLEEP);
				group_create(pg->cmt_parent->cmt_children);
			}
			pg->cmt_siblings = pg->cmt_parent->cmt_children;
			children = ++pg->cmt_parent->cmt_nchildren;
		}

		group_expand(pg->cmt_siblings, children);
		group_expand(&cmt_root->cl_pgs, cmt_root->cl_npgs);
	}

	/*
	 * Cache the chip and core IDs in the cpu_t->cpu_physid structure
	 * for fast lookups later.
	 */
	if (cp->cpu_physid) {
		cp->cpu_physid->cpu_chipid =
		    pg_plat_hw_instance_id(cp, PGHW_CHIP);
		cp->cpu_physid->cpu_coreid = pg_plat_get_core_id(cp);

		/*
		 * If this cpu has a PG representing shared cache, then set
		 * cpu_cacheid to that PG's logical id
		 */
		if (pg_cache)
			cp->cpu_physid->cpu_cacheid = pg_cache->pg_id;
	}

	/* CPU0 only initialization */
	if (is_cpu0) {
		pg_cmt_cpu_startup(cp);
		is_cpu0 = 0;
		cpu0_lgrp = lgrp;
	}

}

/*
 * Class callback when a CPU is leaving the system (deletion).
 *
 * Unlinks the CPU's PGs from the load balancing hierarchy and
 * removes the CPU from all of its CMT PGs, destroying any that
 * become empty.
 */
static void
pg_cmt_cpu_fini(cpu_t *cp)
{
	group_iter_t	i;
	pg_cmt_t	*pg;
	group_t		*pgs, *cmt_pgs;
	lgrp_handle_t	lgrp_handle;
	cmt_lgrp_t	*lgrp;

	pgs = &cp->cpu_pg->pgs;
	cmt_pgs = &cp->cpu_pg->cmt_pgs;

	/*
	 * Find the lgroup that encapsulates this CPU's CMT hierarchy
	 */
	lgrp_handle = lgrp_plat_cpu_to_hand(cp->cpu_id);

	lgrp = pg_cmt_find_lgrp(lgrp_handle);
	if (lgrp == NULL) {
		/*
		 * This is a bit of a special case.
		 * The only way this can happen is if the CPU's lgrp
		 * handle changed out from underneath us, which is what
		 * happens with null_proc_lpa on starcat systems.
		 *
		 * Use the initial boot CPU lgrp, since this is what
		 * we need to tear down.
		 */
		lgrp = cpu0_lgrp;
	}

	/*
	 * First, clean up anything load balancing specific for each of
	 * the CPU's PGs that participated in CMT load balancing
	 */
	pg = (pg_cmt_t *)cp->cpu_pg->cmt_lineage;
	while (pg != NULL) {

		/*
		 * Remove the PG from the CPU's load balancing lineage
		 */
		(void) group_remove(cmt_pgs, pg, GRP_RESIZE);

		/*
		 * If it's about to become empty, destroy it's children
		 * group, and remove it's reference from it's siblings.
		 * This is done here (rather than below) to avoid removing
		 * our reference from a PG that we just eliminated.
		 */
		if (GROUP_SIZE(&((pg_t *)pg)->pg_cpus) == 1) {
			if (pg->cmt_children != NULL)
				group_destroy(pg->cmt_children);
			if (pg->cmt_siblings != NULL) {
				if (pg->cmt_siblings == &lgrp->cl_pgs)
					lgrp->cl_npgs--;
				else
					pg->cmt_parent->cmt_nchildren--;
			}
		}
		pg = pg->cmt_parent;
	}
	ASSERT(GROUP_SIZE(cmt_pgs) == 0);

	/*
	 * Now that the load balancing lineage updates have happened,
	 * remove the CPU from all it's PGs (destroying any that become
	 * empty).
	 */
	group_iter_init(&i);
	while ((pg = group_iterate(pgs, &i)) != NULL) {
		if (IS_CMT_PG(pg) == 0)
			continue;

		pg_cpu_delete((pg_t *)pg, cp);
		/*
		 * Deleting the CPU from the PG changes the CPU's
		 * PG group over which we are actively iterating
		 * Re-initialize the iteration
		 */
		group_iter_init(&i);

		if (GROUP_SIZE(&((pg_t *)pg)->pg_cpus) == 0) {

			/*
			 * The PG has become zero sized, so destroy it.
			 */
			group_destroy(&pg->cmt_cpus_actv);
			bitset_fini(&pg->cmt_cpus_actv_set);
			pghw_fini((pghw_t *)pg);

			pg_destroy((pg_t *)pg);
		}
	}
}

/*
 * Class callback when a CPU is entering a cpu partition
 */
static void
pg_cmt_cpupart_in(cpu_t *cp, cpupart_t *pp)
{
	group_t		*pgs;
	pg_t		*pg;
	group_iter_t	i;

	ASSERT(MUTEX_HELD(&cpu_lock));

	pgs = &cp->cpu_pg->pgs;

	/*
	 * Ensure that the new partition's PG bitset
	 * is large enough for all CMT PG's to which cp
	 * belongs
	 */
	group_iter_init(&i);
	while ((pg = group_iterate(pgs, &i)) != NULL) {
		if (IS_CMT_PG(pg) == 0)
			continue;

		if (bitset_capacity(&pp->cp_cmt_pgs) <= pg->pg_id)
			bitset_resize(&pp->cp_cmt_pgs, pg->pg_id + 1);
	}
}

/*
 * Class callback when a CPU is actually moving partitions
 */
static void
pg_cmt_cpupart_move(cpu_t *cp, cpupart_t *oldpp, cpupart_t *newpp)
{
	cpu_t		*cpp;
	group_t		*pgs;
	pg_t		*pg;
	group_iter_t	pg_iter;
	pg_cpu_itr_t	cpu_iter;
	boolean_t	found;

	ASSERT(MUTEX_HELD(&cpu_lock));

	pgs = &cp->cpu_pg->pgs;
	group_iter_init(&pg_iter);

	/*
	 * Iterate over the CPUs CMT PGs
	 */
	while ((pg = group_iterate(pgs, &pg_iter)) != NULL) {

		if (IS_CMT_PG(pg) == 0)
			continue;

		/*
		 * Add the PG to the bitset in the new partition.
		 */
		bitset_add(&newpp->cp_cmt_pgs, pg->pg_id);

		/*
		 * Remove the PG from the bitset in the old partition
		 * if the last of the PG's CPUs have left.
		 */
		found = B_FALSE;
		PG_CPU_ITR_INIT(pg, cpu_iter);
		while ((cpp = pg_cpu_next(&cpu_iter)) != NULL) {
			if (cpp == cp)
				continue;
			if (CPU_ACTIVE(cpp) &&
			    cpp->cpu_part->cp_id == oldpp->cp_id) {
				found = B_TRUE;
				break;
			}
		}
		if (!found)
			bitset_del(&cp->cpu_part->cp_cmt_pgs, pg->pg_id);
	}
}

/*
 * Class callback when a CPU becomes active (online)
 *
 * This is called in a context where CPUs are paused
 */
static void
pg_cmt_cpu_active(cpu_t *cp)
{
	int		err;
	group_iter_t	i;
	pg_cmt_t	*pg;
	group_t		*pgs;

	ASSERT(MUTEX_HELD(&cpu_lock));

	pgs = &cp->cpu_pg->pgs;
	group_iter_init(&i);

	/*
	 * Iterate over the CPU's PGs
	 */
	while ((pg = group_iterate(pgs, &i)) != NULL) {

		if (IS_CMT_PG(pg) == 0)
			continue;

		err = group_add(&pg->cmt_cpus_actv, cp, GRP_NORESIZE);
		ASSERT(err == 0);

		/*
		 * If this is the first active CPU in the PG, and it
		 * represents a hardware sharing relationship over which
		 * CMT load balancing is performed, add it as a candidate
		 * for balancing with it's siblings.
		 */
		if (GROUP_SIZE(&pg->cmt_cpus_actv) == 1 &&
		    pg_plat_cmt_load_bal_hw(((pghw_t *)pg)->pghw_hw)) {
			err = group_add(pg->cmt_siblings, pg, GRP_NORESIZE);
			ASSERT(err == 0);

			/*
			 * If this is a top level PG, add it as a balancing
			 * candidate when balancing within the root lgroup
			 */
			if (pg->cmt_parent == NULL) {
				err = group_add(&cmt_root->cl_pgs, pg,
				    GRP_NORESIZE);
				ASSERT(err == 0);
			}
		}

		/*
		 * Notate the CPU in the PGs active CPU bitset.
		 * Also notate the PG as being active in it's associated
		 * partition
		 */
		bitset_add(&pg->cmt_cpus_actv_set, cp->cpu_seqid);
		bitset_add(&cp->cpu_part->cp_cmt_pgs, ((pg_t *)pg)->pg_id);
	}
}

/*
 * Class callback when a CPU goes inactive (offline)
 *
 * This is called in a context where CPUs are paused
 */
static void
pg_cmt_cpu_inactive(cpu_t *cp)
{
	int		err;
	group_t		*pgs;
	pg_cmt_t	*pg;
	cpu_t		*cpp;
	group_iter_t	i;
	pg_cpu_itr_t	cpu_itr;
	boolean_t	found;

	ASSERT(MUTEX_HELD(&cpu_lock));

	pgs = &cp->cpu_pg->pgs;
	group_iter_init(&i);

	while ((pg = group_iterate(pgs, &i)) != NULL) {

		if (IS_CMT_PG(pg) == 0)
			continue;

		/*
		 * Remove the CPU from the CMT PGs active CPU group
		 * bitmap
		 */
		err = group_remove(&pg->cmt_cpus_actv, cp, GRP_NORESIZE);
		ASSERT(err == 0);

		bitset_del(&pg->cmt_cpus_actv_set, cp->cpu_seqid);

		/*
		 * If there are no more active CPUs in this PG over which
		 * load was balanced, remove it as a balancing candidate.
		 */
		if (GROUP_SIZE(&pg->cmt_cpus_actv) == 0 &&
		    pg_plat_cmt_load_bal_hw(((pghw_t *)pg)->pghw_hw)) {
			err = group_remove(pg->cmt_siblings, pg, GRP_NORESIZE);
			ASSERT(err == 0);

			/* Mirror pg_cmt_cpu_active()'s root lgroup handling */
			if (pg->cmt_parent == NULL) {
				err = group_remove(&cmt_root->cl_pgs, pg,
				    GRP_NORESIZE);
				ASSERT(err == 0);
			}
		}

		/*
		 * Assert the number of active CPUs does not exceed
		 * the total number of CPUs in the PG
		 */
		ASSERT(GROUP_SIZE(&pg->cmt_cpus_actv) <=
		    GROUP_SIZE(&((pg_t *)pg)->pg_cpus));

		/*
		 * Update the PG bitset in the CPU's old partition
		 */
		found = B_FALSE;
		PG_CPU_ITR_INIT(pg, cpu_itr);
		while ((cpp = pg_cpu_next(&cpu_itr)) != NULL) {
			if (cpp == cp)
				continue;
			if (CPU_ACTIVE(cpp) &&
			    cpp->cpu_part->cp_id == cp->cpu_part->cp_id) {
				found = B_TRUE;
				break;
			}
		}
		if (!found) {
			bitset_del(&cp->cpu_part->cp_cmt_pgs,
			    ((pg_t *)pg)->pg_id);
		}
	}
}

/*
 * Return non-zero if the CPU belongs in the given PG
 */
static int
pg_cmt_cpu_belongs(pg_t *pg, cpu_t *cp)
{
	cpu_t	*pg_cpu;

	/* Compare against any existing member; index 0 always exists here */
	pg_cpu = GROUP_ACCESS(&pg->pg_cpus, 0);

	ASSERT(pg_cpu != NULL);

	/*
	 * The CPU belongs if, given the nature of the hardware sharing
	 * relationship represented by the PG, the CPU has that
	 * relationship with some other CPU already in the PG
	 */
	if (pg_plat_cpus_share(cp, pg_cpu, ((pghw_t *)pg)->pghw_hw))
		return (1);

	return (0);
}

/*
 * Hierarchy packing utility routine. The hierarchy order is preserved.
 * Shifts non-NULL entries of hier[] down over NULL gaps, in place.
 */
static void
pg_cmt_hier_pack(void *hier[], int sz)
{
	int	i, j;

	for (i = 0; i < sz; i++) {
		if (hier[i] != NULL)
			continue;

		/* Find the next non-NULL entry and move it into the gap */
		for (j = i; j < sz; j++) {
			if (hier[j] != NULL) {
				hier[i] = hier[j];
				hier[j] = NULL;
				break;
			}
		}
		/* No more non-NULL entries; the rest is already packed */
		if (j == sz)
			break;
	}
}

/*
 * Return a cmt_lgrp_t * given an lgroup handle.
 * Returns NULL if no cmt_lgrp_t with a matching handle exists.
 */
static cmt_lgrp_t *
pg_cmt_find_lgrp(lgrp_handle_t hand)
{
	cmt_lgrp_t	*lgrp;

	ASSERT(MUTEX_HELD(&cpu_lock));

	lgrp = cmt_lgrps;
	while (lgrp != NULL) {
		if (lgrp->cl_hand == hand)
			break;
		lgrp = lgrp->cl_next;
	}
	return (lgrp);
}

/*
 * Create a cmt_lgrp_t with the specified handle.
813fb2f18f8Sesaxe */ 814a6604450Sesaxe static cmt_lgrp_t * 815a6604450Sesaxe pg_cmt_lgrp_create(lgrp_handle_t hand) 816a6604450Sesaxe { 817a6604450Sesaxe cmt_lgrp_t *lgrp; 818a6604450Sesaxe 819a6604450Sesaxe ASSERT(MUTEX_HELD(&cpu_lock)); 820a6604450Sesaxe 821fb2f18f8Sesaxe lgrp = kmem_zalloc(sizeof (cmt_lgrp_t), KM_SLEEP); 822fb2f18f8Sesaxe 823fb2f18f8Sesaxe lgrp->cl_hand = hand; 824fb2f18f8Sesaxe lgrp->cl_npgs = 0; 825fb2f18f8Sesaxe lgrp->cl_next = cmt_lgrps; 826fb2f18f8Sesaxe cmt_lgrps = lgrp; 827fb2f18f8Sesaxe group_create(&lgrp->cl_pgs); 828fb2f18f8Sesaxe 829fb2f18f8Sesaxe return (lgrp); 830fb2f18f8Sesaxe } 831*6890d023SEric Saxe 832*6890d023SEric Saxe /* 833*6890d023SEric Saxe * Perform multi-level CMT load balancing of running threads. 834*6890d023SEric Saxe * 835*6890d023SEric Saxe * tp is the thread being enqueued. 836*6890d023SEric Saxe * cp is a hint CPU, against which CMT load balancing will be performed. 837*6890d023SEric Saxe * 838*6890d023SEric Saxe * Returns cp, or a CPU better than cp with respect to balancing 839*6890d023SEric Saxe * running thread load. 
 */
cpu_t *
cmt_balance(kthread_t *tp, cpu_t *cp)
{
	int		hint, i, cpu, nsiblings;
	int		self = 0;
	group_t		*cmt_pgs, *siblings;
	pg_cmt_t	*pg, *pg_tmp, *tpg = NULL;
	int		pg_nrun, tpg_nrun;
	int		level = 0;
	cpu_t		*newcp;

	ASSERT(THREAD_LOCK_HELD(tp));

	/* The hint CPU's CMT lineage, ordered leaf (level 0) upward */
	cmt_pgs = &cp->cpu_pg->cmt_pgs;

	if (GROUP_SIZE(cmt_pgs) == 0)
		return (cp);	/* nothing to do */

	if (tp == curthread)
		self = 1;

	/*
	 * Balance across siblings in the CPUs CMT lineage
	 * If the thread is homed to the root lgroup, perform
	 * top level balancing against other top level PGs
	 * in the system. Otherwise, start with the default
	 * top level siblings group, which is within the leaf lgroup
	 */
	pg = GROUP_ACCESS(cmt_pgs, level);
	if (tp->t_lpl->lpl_lgrpid == LGRP_ROOTID)
		siblings = &cmt_root->cl_pgs;
	else
		siblings = pg->cmt_siblings;

	/*
	 * Traverse down the lineage until we find a level that needs
	 * balancing, or we get to the end.
	 */
	for (;;) {
		nsiblings = GROUP_SIZE(siblings);	/* self inclusive */
		if (nsiblings == 1)
			goto next_level;	/* nobody to balance against */

		pg_nrun = pg->cmt_nrunning;
		if (self &&
		    bitset_in_set(&pg->cmt_cpus_actv_set, CPU->cpu_seqid))
			pg_nrun--;	/* Ignore curthread's effect */

		hint = CPU_PSEUDO_RANDOM() % nsiblings;

		/*
		 * Find a balancing candidate from among our siblings
		 * "hint" is a hint for where to start looking
		 */
		i = hint;
		do {
			ASSERT(i < nsiblings);
			pg_tmp = GROUP_ACCESS(siblings, i);

			/*
			 * The candidate must not be us, and must
			 * have some CPU resources in the thread's
			 * partition
			 */
			if (pg_tmp != pg &&
			    bitset_in_set(&tp->t_cpupart->cp_cmt_pgs,
			    ((pg_t *)pg_tmp)->pg_id)) {
				tpg = pg_tmp;
				break;
			}

			if (++i >= nsiblings)
				i = 0;	/* wrap around the sibling group */
		} while (i != hint);

		if (!tpg)
			goto next_level;	/* no candidates at this level */

		/*
		 * Check if the balancing target is underloaded
		 * Decide to balance if the target is running fewer
		 * threads, or if it's running the same number of threads
		 * with more online CPUs
		 */
		tpg_nrun = tpg->cmt_nrunning;
		if (pg_nrun > tpg_nrun ||
		    (pg_nrun == tpg_nrun &&
		    (GROUP_SIZE(&tpg->cmt_cpus_actv) >
		    GROUP_SIZE(&pg->cmt_cpus_actv)))) {
			break;
		}
		tpg = NULL;	/* target isn't underloaded; keep looking */

next_level:
		if (++level == GROUP_SIZE(cmt_pgs))
			break;	/* exhausted the lineage */

		pg = GROUP_ACCESS(cmt_pgs, level);
		siblings = pg->cmt_siblings;
	}

	if (tpg) {
		uint_t	tgt_size = GROUP_SIZE(&tpg->cmt_cpus_actv);

		/*
		 * Select an idle CPU from the target, starting the
		 * search at a pseudo random offset. A CPU must be in
		 * the thread's partition to be eligible.
		 */
		hint = CPU_PSEUDO_RANDOM() % tgt_size;
		cpu = hint;
		do {
			newcp = GROUP_ACCESS(&tpg->cmt_cpus_actv, cpu);
			if (newcp->cpu_part == tp->t_cpupart &&
			    newcp->cpu_dispatch_pri == -1) {
				cp = newcp;	/* found an idle CPU */
				break;
			}
			if (++cpu == tgt_size)
				cpu = 0;
		} while (cpu != hint);
	}

	return (cp);
}