1fb2f18f8Sesaxe /* 2fb2f18f8Sesaxe * CDDL HEADER START 3fb2f18f8Sesaxe * 4fb2f18f8Sesaxe * The contents of this file are subject to the terms of the 5fb2f18f8Sesaxe * Common Development and Distribution License (the "License"). 6fb2f18f8Sesaxe * You may not use this file except in compliance with the License. 7fb2f18f8Sesaxe * 8fb2f18f8Sesaxe * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9fb2f18f8Sesaxe * or http://www.opensolaris.org/os/licensing. 10fb2f18f8Sesaxe * See the License for the specific language governing permissions 11fb2f18f8Sesaxe * and limitations under the License. 12fb2f18f8Sesaxe * 13fb2f18f8Sesaxe * When distributing Covered Code, include this CDDL HEADER in each 14fb2f18f8Sesaxe * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15fb2f18f8Sesaxe * If applicable, add the following below this CDDL HEADER, with the 16fb2f18f8Sesaxe * fields enclosed by brackets "[]" replaced with your own identifying 17fb2f18f8Sesaxe * information: Portions Copyright [yyyy] [name of copyright owner] 18fb2f18f8Sesaxe * 19fb2f18f8Sesaxe * CDDL HEADER END 20fb2f18f8Sesaxe */ 21fb2f18f8Sesaxe /* 22*3e81cacfSEric Saxe * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23fb2f18f8Sesaxe * Use is subject to license terms. 
24fb2f18f8Sesaxe */ 25fb2f18f8Sesaxe 26fb2f18f8Sesaxe #include <sys/systm.h> 27fb2f18f8Sesaxe #include <sys/types.h> 28fb2f18f8Sesaxe #include <sys/param.h> 29fb2f18f8Sesaxe #include <sys/thread.h> 30fb2f18f8Sesaxe #include <sys/cpuvar.h> 31fb2f18f8Sesaxe #include <sys/cpupart.h> 32fb2f18f8Sesaxe #include <sys/kmem.h> 33fb2f18f8Sesaxe #include <sys/cmn_err.h> 34fb2f18f8Sesaxe #include <sys/kstat.h> 35fb2f18f8Sesaxe #include <sys/processor.h> 36fb2f18f8Sesaxe #include <sys/disp.h> 37fb2f18f8Sesaxe #include <sys/group.h> 38fb2f18f8Sesaxe #include <sys/pghw.h> 39fb2f18f8Sesaxe #include <sys/bitset.h> 40fb2f18f8Sesaxe #include <sys/lgrp.h> 41fb2f18f8Sesaxe #include <sys/cmt.h> 42fb2f18f8Sesaxe 43fb2f18f8Sesaxe /* 44fb2f18f8Sesaxe * CMT scheduler / dispatcher support 45fb2f18f8Sesaxe * 46fb2f18f8Sesaxe * This file implements CMT scheduler support using Processor Groups. 47fb2f18f8Sesaxe * The CMT processor group class creates and maintains the CMT class 48fb2f18f8Sesaxe * specific processor group pg_cmt_t. 49fb2f18f8Sesaxe * 50fb2f18f8Sesaxe * ---------------------------- <-- pg_cmt_t * 51fb2f18f8Sesaxe * | pghw_t | 52fb2f18f8Sesaxe * ---------------------------- 53fb2f18f8Sesaxe * | CMT class specific data | 54fb2f18f8Sesaxe * | - hierarchy linkage | 55fb2f18f8Sesaxe * | - CMT load balancing data| 56fb2f18f8Sesaxe * | - active CPU group/bitset| 57fb2f18f8Sesaxe * ---------------------------- 58fb2f18f8Sesaxe * 59fb2f18f8Sesaxe * The scheduler/dispatcher leverages knowledge of the performance 60fb2f18f8Sesaxe * relevant CMT sharing relationships existing between cpus to implement 61fb2f18f8Sesaxe * optimized affinity and load balancing policies. 62fb2f18f8Sesaxe * 63fb2f18f8Sesaxe * Load balancing policy seeks to improve performance by minimizing 64fb2f18f8Sesaxe * contention over shared processor resources / facilities, while the 65fb2f18f8Sesaxe * affinity policies seek to improve cache and TLB utilization. 
66fb2f18f8Sesaxe  *
67fb2f18f8Sesaxe  * The CMT PGs created by this class are already arranged into a
68fb2f18f8Sesaxe  * hierarchy (which is done in the pghw layer). To implement the top-down
69fb2f18f8Sesaxe  * CMT load balancing algorithm, the CMT PGs additionally maintain
70fb2f18f8Sesaxe  * parent, child and sibling hierarchy relationships.
71fb2f18f8Sesaxe  * Parent PGs always contain a superset of their children's resources,
72fb2f18f8Sesaxe  * each PG can have at most one parent, and siblings are the group of PGs
73fb2f18f8Sesaxe  * sharing the same parent.
74fb2f18f8Sesaxe  *
75fb2f18f8Sesaxe  * On NUMA systems, the CMT load balancing algorithm balances across the
76fb2f18f8Sesaxe  * CMT PGs within their respective lgroups. On UMA based systems, there
77fb2f18f8Sesaxe  * exists a top level group of PGs to balance across. On NUMA systems multiple
78fb2f18f8Sesaxe  * top level groups are instantiated, where the top level balancing begins by
79fb2f18f8Sesaxe  * balancing across the CMT PGs within their respective (per lgroup) top level
80fb2f18f8Sesaxe  * groups.
 */
/*
 * Per-lgroup CMT bookkeeping: each cmt_lgrp_t anchors the top level
 * group of active CMT PGs over which top-down load balancing is
 * performed within that lgroup.  Instances are linked into the global
 * cmt_lgrps list (see pg_cmt_lgrp_create()).
 */
typedef struct cmt_lgrp {
	group_t		cl_pgs;		/* Top level group of active CMT PGs */
	int		cl_npgs;	/* # of top level PGs in the lgroup */
	lgrp_handle_t	cl_hand;	/* lgroup's platform handle */
	struct cmt_lgrp *cl_next;	/* next cmt_lgrp */
} cmt_lgrp_t;

static cmt_lgrp_t	*cmt_lgrps = NULL;	/* cmt_lgrps list head */
static cmt_lgrp_t	*cpu0_lgrp = NULL;	/* boot CPU's initial lgrp */
						/* used for null_proc_lpa */
static cmt_lgrp_t	*cmt_root = NULL;	/* Reference to root cmt pg */

static int		is_cpu0 = 1;	/* true if this is boot CPU context */

/*
 * Set this to non-zero to disable CMT scheduling
 * This must be done via kmdb -d, as /etc/system will be too late
 */
static int		cmt_sched_disabled = 0;

static pg_cid_t		pg_cmt_class_id;	/* PG class id */

/* Forward declarations of the CMT PG class callbacks (see pg_ops_cmt) */
static pg_t		*pg_cmt_alloc();
static void		pg_cmt_free(pg_t *);
static void		pg_cmt_cpu_init(cpu_t *);
static void		pg_cmt_cpu_fini(cpu_t *);
static void		pg_cmt_cpu_active(cpu_t *);
static void		pg_cmt_cpu_inactive(cpu_t *);
static void		pg_cmt_cpupart_in(cpu_t *, cpupart_t *);
static void		pg_cmt_cpupart_move(cpu_t *, cpupart_t *, cpupart_t *);
static void		pg_cmt_hier_pack(void **, int);
static int		pg_cmt_cpu_belongs(pg_t *, cpu_t *);
static int		pg_cmt_hw(pghw_type_t);
static cmt_lgrp_t	*pg_cmt_find_lgrp(lgrp_handle_t);
static cmt_lgrp_t	*pg_cmt_lgrp_create(lgrp_handle_t);

/*
119fb2f18f8Sesaxe * Macro to test if PG is managed by the CMT PG class 120fb2f18f8Sesaxe */ 121fb2f18f8Sesaxe #define IS_CMT_PG(pg) (((pg_t *)(pg))->pg_class->pgc_id == pg_cmt_class_id) 122fb2f18f8Sesaxe 123fb2f18f8Sesaxe /* 124fb2f18f8Sesaxe * CMT PG ops 125fb2f18f8Sesaxe */ 126fb2f18f8Sesaxe struct pg_ops pg_ops_cmt = { 127fb2f18f8Sesaxe pg_cmt_alloc, 128fb2f18f8Sesaxe pg_cmt_free, 129fb2f18f8Sesaxe pg_cmt_cpu_init, 130fb2f18f8Sesaxe pg_cmt_cpu_fini, 131fb2f18f8Sesaxe pg_cmt_cpu_active, 132fb2f18f8Sesaxe pg_cmt_cpu_inactive, 133fb2f18f8Sesaxe pg_cmt_cpupart_in, 134fb2f18f8Sesaxe NULL, /* cpupart_out */ 135fb2f18f8Sesaxe pg_cmt_cpupart_move, 136fb2f18f8Sesaxe pg_cmt_cpu_belongs, 137fb2f18f8Sesaxe }; 138fb2f18f8Sesaxe 139fb2f18f8Sesaxe /* 140fb2f18f8Sesaxe * Initialize the CMT PG class 141fb2f18f8Sesaxe */ 142fb2f18f8Sesaxe void 143fb2f18f8Sesaxe pg_cmt_class_init(void) 144fb2f18f8Sesaxe { 145fb2f18f8Sesaxe if (cmt_sched_disabled) 146fb2f18f8Sesaxe return; 147fb2f18f8Sesaxe 148fb2f18f8Sesaxe pg_cmt_class_id = pg_class_register("cmt", &pg_ops_cmt, PGR_PHYSICAL); 149fb2f18f8Sesaxe } 150fb2f18f8Sesaxe 151fb2f18f8Sesaxe /* 152fb2f18f8Sesaxe * Called to indicate a new CPU has started up so 153fb2f18f8Sesaxe * that either t0 or the slave startup thread can 154fb2f18f8Sesaxe * be accounted for. 155fb2f18f8Sesaxe */ 156fb2f18f8Sesaxe void 157fb2f18f8Sesaxe pg_cmt_cpu_startup(cpu_t *cp) 158fb2f18f8Sesaxe { 159fb2f18f8Sesaxe PG_NRUN_UPDATE(cp, 1); 160fb2f18f8Sesaxe } 161fb2f18f8Sesaxe 162fb2f18f8Sesaxe /* 163fb2f18f8Sesaxe * Adjust the CMT load in the CMT PGs in which the CPU belongs 164fb2f18f8Sesaxe * Note that "n" can be positive in the case of increasing 165fb2f18f8Sesaxe * load, or negative in the case of decreasing load. 
166fb2f18f8Sesaxe */ 167fb2f18f8Sesaxe void 168fb2f18f8Sesaxe pg_cmt_load(cpu_t *cp, int n) 169fb2f18f8Sesaxe { 170fb2f18f8Sesaxe pg_cmt_t *pg; 171fb2f18f8Sesaxe 172fb2f18f8Sesaxe pg = (pg_cmt_t *)cp->cpu_pg->cmt_lineage; 173fb2f18f8Sesaxe while (pg != NULL) { 174fb2f18f8Sesaxe ASSERT(IS_CMT_PG(pg)); 175fb2f18f8Sesaxe atomic_add_32(&pg->cmt_nrunning, n); 176fb2f18f8Sesaxe pg = pg->cmt_parent; 177fb2f18f8Sesaxe } 178fb2f18f8Sesaxe } 179fb2f18f8Sesaxe 180fb2f18f8Sesaxe /* 181fb2f18f8Sesaxe * Return non-zero if thread can migrate between "from" and "to" 182fb2f18f8Sesaxe * without a performance penalty 183fb2f18f8Sesaxe */ 184fb2f18f8Sesaxe int 185fb2f18f8Sesaxe pg_cmt_can_migrate(cpu_t *from, cpu_t *to) 186fb2f18f8Sesaxe { 187fb2f18f8Sesaxe if (from->cpu_physid->cpu_cacheid == 188fb2f18f8Sesaxe to->cpu_physid->cpu_cacheid) 189fb2f18f8Sesaxe return (1); 190fb2f18f8Sesaxe return (0); 191fb2f18f8Sesaxe } 192fb2f18f8Sesaxe 193fb2f18f8Sesaxe /* 194fb2f18f8Sesaxe * CMT class specific PG allocation 195fb2f18f8Sesaxe */ 196fb2f18f8Sesaxe static pg_t * 197fb2f18f8Sesaxe pg_cmt_alloc(void) 198fb2f18f8Sesaxe { 199fb2f18f8Sesaxe return (kmem_zalloc(sizeof (pg_cmt_t), KM_NOSLEEP)); 200fb2f18f8Sesaxe } 201fb2f18f8Sesaxe 202fb2f18f8Sesaxe /* 203fb2f18f8Sesaxe * Class specific PG de-allocation 204fb2f18f8Sesaxe */ 205fb2f18f8Sesaxe static void 206fb2f18f8Sesaxe pg_cmt_free(pg_t *pg) 207fb2f18f8Sesaxe { 208fb2f18f8Sesaxe ASSERT(pg != NULL); 209fb2f18f8Sesaxe ASSERT(IS_CMT_PG(pg)); 210fb2f18f8Sesaxe 211fb2f18f8Sesaxe kmem_free((pg_cmt_t *)pg, sizeof (pg_cmt_t)); 212fb2f18f8Sesaxe } 213fb2f18f8Sesaxe 214fb2f18f8Sesaxe /* 215fb2f18f8Sesaxe * Return 1 if CMT scheduling policies should be impelmented 216fb2f18f8Sesaxe * for the specified hardware sharing relationship. 
 */
static int
pg_cmt_hw(pghw_type_t hw)
{
	/* Relevant if the platform balances load or has affinity over hw */
	return (pg_plat_cmt_load_bal_hw(hw) ||
	    pg_plat_cmt_affinity_hw(hw));
}

/*
 * CMT class callback for a new CPU entering the system.
 *
 * Places the CPU into (or creates) the CMT PG for each performance
 * relevant hardware sharing relationship it participates in, and
 * builds the CPU's load balancing lineage (cmt_lineage) plus the
 * parent/child/sibling linkage of those PGs.
 */
static void
pg_cmt_cpu_init(cpu_t *cp)
{
	pg_cmt_t	*pg;
	group_t		*cmt_pgs;
	int		level, max_level, nlevels;
	pghw_type_t	hw;
	pg_t		*pg_cache = NULL;
	pg_cmt_t	*cpu_cmt_hier[PGHW_NUM_COMPONENTS];
	lgrp_handle_t	lgrp_handle;
	cmt_lgrp_t	*lgrp;

	ASSERT(MUTEX_HELD(&cpu_lock));

	/*
	 * A new CPU is coming into the system.
	 * Interrogate the platform to see if the CPU
	 * has any performance relevant CMT sharing
	 * relationships
	 */
	cmt_pgs = &cp->cpu_pg->cmt_pgs;
	cp->cpu_pg->cmt_lineage = NULL;

	bzero(cpu_cmt_hier, sizeof (cpu_cmt_hier));
	max_level = nlevels = 0;
	for (hw = PGHW_START; hw < PGHW_NUM_COMPONENTS; hw++) {

		/*
		 * We're only interested in CMT hw sharing relationships
		 */
		if (pg_cmt_hw(hw) == 0 || pg_plat_hw_shared(cp, hw) == 0)
			continue;

		/*
		 * Find (or create) the PG associated with
		 * the hw sharing relationship in which cp
		 * belongs.
		 *
		 * Determine if a suitable PG already
		 * exists, or if one needs to be created.
		 */
		pg = (pg_cmt_t *)pghw_place_cpu(cp, hw);
		if (pg == NULL) {
			/*
			 * Create a new one.
			 * Initialize the common...
			 */
			pg = (pg_cmt_t *)pg_create(pg_cmt_class_id);

			/* ... physical ... */
			pghw_init((pghw_t *)pg, cp, hw);

			/*
			 * ... and CMT specific portions of the
			 * structure.
			 */
			bitset_init(&pg->cmt_cpus_actv_set);
			group_create(&pg->cmt_cpus_actv);
		} else {
			ASSERT(IS_CMT_PG(pg));
		}

		/* Add the CPU to the PG */
		pg_cpu_add((pg_t *)pg, cp);

		/*
		 * Ensure capacity of the active CPU group/bitset
		 */
		group_expand(&pg->cmt_cpus_actv,
		    GROUP_SIZE(&((pg_t *)pg)->pg_cpus));

		if (cp->cpu_seqid >=
		    bitset_capacity(&pg->cmt_cpus_actv_set)) {
			bitset_resize(&pg->cmt_cpus_actv_set,
			    cp->cpu_seqid + 1);
		}

		/*
		 * Build a lineage of CMT PGs for load balancing
		 */
		if (pg_plat_cmt_load_bal_hw(hw)) {
			level = pghw_level(hw);
			cpu_cmt_hier[level] = pg;
			if (level > max_level)
				max_level = level;
			nlevels++;
		}

		/* Cache this for later (used below to set cpu_cacheid) */
		if (hw == PGHW_CACHE)
			pg_cache = (pg_t *)pg;
	}

	/*
	 * Pack out any gaps in the constructed lineage,
	 * then size it out.
	 *
	 * Gaps may exist where the architecture knows
	 * about a hardware sharing relationship, but such a
	 * relationship either isn't relevant for load
	 * balancing or doesn't exist between CPUs on the system.
	 */
	pg_cmt_hier_pack((void **)cpu_cmt_hier, max_level + 1);
	group_expand(cmt_pgs, nlevels);

	/* Lazily create the root lgroup's cmt_lgrp_t on first use */
	if (cmt_root == NULL)
		cmt_root = pg_cmt_lgrp_create(lgrp_plat_root_hand());

	/*
	 * Find the lgrp that encapsulates this CPU's CMT hierarchy,
	 * and locate/create a suitable cmt_lgrp_t.
	 */
	lgrp_handle = lgrp_plat_cpu_to_hand(cp->cpu_id);
	if ((lgrp = pg_cmt_find_lgrp(lgrp_handle)) == NULL)
		lgrp = pg_cmt_lgrp_create(lgrp_handle);

	/*
	 * For each of the PGs in the CPU's lineage:
	 *	- Add an entry in the CPU's CMT PG group
	 *	which is used by the dispatcher to implement load balancing
	 *	policy.
	 *	- Tie the PG into the CMT hierarchy by connecting
	 *	it to its parent and siblings.
	 */
	for (level = 0; level < nlevels; level++) {
		uint_t		children;
		int		err;

		pg = cpu_cmt_hier[level];
		/* cmt_pgs is ordered top (index 0) down to leaf */
		err = group_add_at(cmt_pgs, pg, nlevels - level - 1);
		ASSERT(err == 0);

		if (level == 0)
			cp->cpu_pg->cmt_lineage = (pg_t *)pg;

		if (pg->cmt_siblings != NULL) {
			/* Already initialized */
			ASSERT(pg->cmt_parent == NULL ||
			    pg->cmt_parent == cpu_cmt_hier[level + 1]);
			ASSERT(pg->cmt_siblings == &lgrp->cl_pgs ||
			    ((pg->cmt_parent != NULL) &&
			    pg->cmt_siblings == pg->cmt_parent->cmt_children));
			continue;
		}

		if ((level + 1) == nlevels) {
			/* Top level PG: siblings are the lgroup's PGs */
			pg->cmt_parent = NULL;

			pg->cmt_siblings = &lgrp->cl_pgs;
			children = ++lgrp->cl_npgs;
			cmt_root->cl_npgs++;
		} else {
			pg->cmt_parent = cpu_cmt_hier[level + 1];

			/*
			 * A good parent keeps track of their children.
			 * The parent's children group is also the PG's
			 * siblings.
			 */
			if (pg->cmt_parent->cmt_children == NULL) {
				pg->cmt_parent->cmt_children =
				    kmem_zalloc(sizeof (group_t), KM_SLEEP);
				group_create(pg->cmt_parent->cmt_children);
			}
			pg->cmt_siblings = pg->cmt_parent->cmt_children;
			children = ++pg->cmt_parent->cmt_nchildren;
		}

		group_expand(pg->cmt_siblings, children);
		group_expand(&cmt_root->cl_pgs, cmt_root->cl_npgs);
	}

	/*
	 * Cache the chip and core IDs in the cpu_t->cpu_physid structure
	 * for fast lookups later.
	 */
	if (cp->cpu_physid) {
		cp->cpu_physid->cpu_chipid =
		    pg_plat_hw_instance_id(cp, PGHW_CHIP);
		cp->cpu_physid->cpu_coreid = pg_plat_get_core_id(cp);

		/*
		 * If this cpu has a PG representing shared cache, then set
		 * cpu_cacheid to that PG's logical id
		 */
		if (pg_cache)
			cp->cpu_physid->cpu_cacheid = pg_cache->pg_id;
	}

	/* CPU0 only initialization */
	if (is_cpu0) {
		pg_cmt_cpu_startup(cp);
		is_cpu0 = 0;
		cpu0_lgrp = lgrp;
	}

}

/*
 * Class callback when a CPU is leaving the system (deletion).
 *
 * Unwinds pg_cmt_cpu_init(): removes the CPU's PGs from its load
 * balancing lineage, then removes the CPU from each of its PGs,
 * destroying any PG that becomes empty.
 */
static void
pg_cmt_cpu_fini(cpu_t *cp)
{
	group_iter_t	i;
	pg_cmt_t	*pg;
	group_t		*pgs, *cmt_pgs;
	lgrp_handle_t	lgrp_handle;
	cmt_lgrp_t	*lgrp;

	pgs = &cp->cpu_pg->pgs;
	cmt_pgs = &cp->cpu_pg->cmt_pgs;

	/*
	 * Find the lgroup that encapsulates this CPU's CMT hierarchy
	 */
	lgrp_handle = lgrp_plat_cpu_to_hand(cp->cpu_id);

	lgrp = pg_cmt_find_lgrp(lgrp_handle);
	if (ncpus == 1 && lgrp != cpu0_lgrp) {
		/*
		 * One might wonder how we could be deconfiguring the
		 * only CPU in the system.
		 *
		 * On Starcat systems when null_proc_lpa is detected,
		 * the boot CPU (which is already configured into a leaf
		 * lgroup), is moved into the root lgroup. This is done by
		 * deconfiguring it from both lgroups and processor
		 * groups), and then later reconfiguring it back in.  This
		 * call to pg_cmt_cpu_fini() is part of that deconfiguration.
		 *
		 * This special case is detected by noting that the platform
		 * has changed the CPU's lgrp affiliation (since it now
		 * belongs in the root). In this case, use the cmt_lgrp_t
		 * cached for the boot CPU, since this is what needs to be
		 * torn down.
		 */
		lgrp = cpu0_lgrp;
	}

	ASSERT(lgrp != NULL);

	/*
	 * First, clean up anything load balancing specific for each of
	 * the CPU's PGs that participated in CMT load balancing
	 */
	pg = (pg_cmt_t *)cp->cpu_pg->cmt_lineage;
	while (pg != NULL) {

		/*
		 * Remove the PG from the CPU's load balancing lineage
		 */
		(void) group_remove(cmt_pgs, pg, GRP_RESIZE);

		/*
		 * If it's about to become empty, destroy its children
		 * group, and remove its reference from its siblings.
		 * This is done here (rather than below) to avoid removing
		 * our reference from a PG that we just eliminated.
		 */
		if (GROUP_SIZE(&((pg_t *)pg)->pg_cpus) == 1) {
			if (pg->cmt_children != NULL)
				group_destroy(pg->cmt_children);
			if (pg->cmt_siblings != NULL) {
				if (pg->cmt_siblings == &lgrp->cl_pgs)
					lgrp->cl_npgs--;
				else
					pg->cmt_parent->cmt_nchildren--;
			}
		}
		pg = pg->cmt_parent;
	}
	ASSERT(GROUP_SIZE(cmt_pgs) == 0);

	/*
	 * Now that the load balancing lineage updates have happened,
	 * remove the CPU from all its PGs (destroying any that become
	 * empty).
	 */
	group_iter_init(&i);
	while ((pg = group_iterate(pgs, &i)) != NULL) {
		if (IS_CMT_PG(pg) == 0)
			continue;

		pg_cpu_delete((pg_t *)pg, cp);
		/*
		 * Deleting the CPU from the PG changes the CPU's
		 * PG group over which we are actively iterating
		 * Re-initialize the iteration
		 */
		group_iter_init(&i);

		if (GROUP_SIZE(&((pg_t *)pg)->pg_cpus) == 0) {

			/*
			 * The PG has become zero sized, so destroy it.
			 */
			group_destroy(&pg->cmt_cpus_actv);
			bitset_fini(&pg->cmt_cpus_actv_set);
			pghw_fini((pghw_t *)pg);

			pg_destroy((pg_t *)pg);
		}
	}
}

/*
 * Class callback when a CPU is entering a cpu partition
 */
static void
pg_cmt_cpupart_in(cpu_t *cp, cpupart_t *pp)
{
	group_t		*pgs;
	pg_t		*pg;
	group_iter_t	i;

	ASSERT(MUTEX_HELD(&cpu_lock));

	pgs = &cp->cpu_pg->pgs;

	/*
	 * Ensure that the new partition's PG bitset
	 * is large enough for all CMT PG's to which cp
	 * belongs
	 */
	group_iter_init(&i);
	while ((pg = group_iterate(pgs, &i)) != NULL) {
		if (IS_CMT_PG(pg) == 0)
			continue;

		if (bitset_capacity(&pp->cp_cmt_pgs) <= pg->pg_id)
			bitset_resize(&pp->cp_cmt_pgs, pg->pg_id + 1);
	}
}

/*
 * Class callback when a CPU is actually moving partitions
 */
static void
pg_cmt_cpupart_move(cpu_t *cp, cpupart_t *oldpp, cpupart_t *newpp)
{
	cpu_t		*cpp;
	group_t		*pgs;
	pg_t		*pg;
	group_iter_t	pg_iter;
	pg_cpu_itr_t	cpu_iter;
	boolean_t	found;

	ASSERT(MUTEX_HELD(&cpu_lock));

	pgs = &cp->cpu_pg->pgs;
	group_iter_init(&pg_iter);

	/*
	 * Iterate over the CPUs CMT PGs
	 */
	while ((pg = group_iterate(pgs, &pg_iter)) != NULL) {

		if (IS_CMT_PG(pg) == 0)
			continue;

		/*
		 * Add the PG to the bitset in the new partition.
		 */
		bitset_add(&newpp->cp_cmt_pgs, pg->pg_id);

		/*
		 * Remove the PG from the bitset in the old partition
		 * if the last of the PG's CPUs have left.
		 */
		found = B_FALSE;
		PG_CPU_ITR_INIT(pg, cpu_iter);
		while ((cpp = pg_cpu_next(&cpu_iter)) != NULL) {
			if (cpp == cp)
				continue;
			if (CPU_ACTIVE(cpp) &&
			    cpp->cpu_part->cp_id == oldpp->cp_id) {
				found = B_TRUE;
				break;
			}
		}
		/*
		 * NOTE(review): the scan above checks membership against
		 * oldpp, but the deletion below targets cp->cpu_part's
		 * bitset — presumably cp->cpu_part still references the
		 * old partition at this point; confirm against the caller.
		 */
		if (!found)
			bitset_del(&cp->cpu_part->cp_cmt_pgs, pg->pg_id);
	}
}

/*
 * Class callback when a CPU becomes active (online)
 *
 * This is called in a context where CPUs are paused
 */
static void
pg_cmt_cpu_active(cpu_t *cp)
{
	int		err;
	group_iter_t	i;
	pg_cmt_t	*pg;
	group_t		*pgs;

	ASSERT(MUTEX_HELD(&cpu_lock));

	pgs = &cp->cpu_pg->pgs;
	group_iter_init(&i);

	/*
	 * Iterate over the CPU's PGs
	 */
	while ((pg = group_iterate(pgs, &i)) != NULL) {

		if (IS_CMT_PG(pg) == 0)
			continue;

		err = group_add(&pg->cmt_cpus_actv, cp, GRP_NORESIZE);
		ASSERT(err == 0);

		/*
		 * If this is the first active CPU in the PG, and it
		 * represents a hardware sharing relationship over which
		 * CMT load balancing is performed, add it as a candidate
		 * for balancing with its siblings.
		 */
		if (GROUP_SIZE(&pg->cmt_cpus_actv) == 1 &&
		    pg_plat_cmt_load_bal_hw(((pghw_t *)pg)->pghw_hw)) {
			err = group_add(pg->cmt_siblings, pg, GRP_NORESIZE);
			ASSERT(err == 0);

			/*
			 * If this is a top level PG, add it as a balancing
			 * candidate when balancing within the root lgroup
			 */
			if (pg->cmt_parent == NULL) {
				err = group_add(&cmt_root->cl_pgs, pg,
				    GRP_NORESIZE);
				ASSERT(err == 0);
			}
		}

		/*
		 * Notate the CPU in the PGs active CPU bitset.
		 * Also notate the PG as being active in its associated
		 * partition
		 */
		bitset_add(&pg->cmt_cpus_actv_set, cp->cpu_seqid);
		bitset_add(&cp->cpu_part->cp_cmt_pgs, ((pg_t *)pg)->pg_id);
	}
}

/*
 * Class callback when a CPU goes inactive (offline)
 *
 * This is called in a context where CPUs are paused
 */
static void
pg_cmt_cpu_inactive(cpu_t *cp)
{
	int		err;
	group_t		*pgs;
	pg_cmt_t	*pg;
	cpu_t		*cpp;
	group_iter_t	i;
	pg_cpu_itr_t	cpu_itr;
	boolean_t	found;

	ASSERT(MUTEX_HELD(&cpu_lock));

	pgs = &cp->cpu_pg->pgs;
	group_iter_init(&i);

	while ((pg = group_iterate(pgs, &i)) != NULL) {

		if (IS_CMT_PG(pg) == 0)
			continue;

		/*
		 * Remove the CPU from the CMT PGs active CPU group
		 * bitmap
		 */
		err = group_remove(&pg->cmt_cpus_actv, cp, GRP_NORESIZE);
		ASSERT(err == 0);

		bitset_del(&pg->cmt_cpus_actv_set, cp->cpu_seqid);

		/*
		 * If there are no more active CPUs in this PG over which
		 * load was balanced, remove it as a balancing candidate.
		 */
		if (GROUP_SIZE(&pg->cmt_cpus_actv) == 0 &&
		    pg_plat_cmt_load_bal_hw(((pghw_t *)pg)->pghw_hw)) {
			err = group_remove(pg->cmt_siblings, pg, GRP_NORESIZE);
			ASSERT(err == 0);

			/* Top level PGs also balance within the root lgroup */
			if (pg->cmt_parent == NULL) {
				err = group_remove(&cmt_root->cl_pgs, pg,
				    GRP_NORESIZE);
				ASSERT(err == 0);
			}
		}

		/*
		 * Assert the number of active CPUs does not exceed
		 * the total number of CPUs in the PG
		 */
		ASSERT(GROUP_SIZE(&pg->cmt_cpus_actv) <=
		    GROUP_SIZE(&((pg_t *)pg)->pg_cpus));

		/*
		 * Update the PG bitset in the CPU's old partition
		 */
		found = B_FALSE;
		PG_CPU_ITR_INIT(pg, cpu_itr);
		while ((cpp = pg_cpu_next(&cpu_itr)) != NULL) {
			if (cpp == cp)
				continue;
			if (CPU_ACTIVE(cpp) &&
			    cpp->cpu_part->cp_id == cp->cpu_part->cp_id) {
				found = B_TRUE;
				break;
			}
		}
		if (!found) {
			bitset_del(&cp->cpu_part->cp_cmt_pgs,
			    ((pg_t *)pg)->pg_id);
		}
	}
}

/*
 * Return non-zero if the CPU belongs in the given PG
 */
static int
pg_cmt_cpu_belongs(pg_t *pg, cpu_t *cp)
{
	cpu_t	*pg_cpu;

	/* Compare against any existing member; index 0 always exists here */
	pg_cpu = GROUP_ACCESS(&pg->pg_cpus, 0);

	ASSERT(pg_cpu != NULL);

	/*
	 * The CPU belongs if, given the nature of the hardware sharing
	 * relationship represented by the PG, the CPU has that
	 * relationship with some other CPU already in the PG
	 */
	if (pg_plat_cpus_share(cp, pg_cpu, ((pghw_t *)pg)->pghw_hw))
		return (1);

	return (0);
}

/*
 * Hierarchy packing utility routine. The hierarchy order is preserved.
 * Shifts non-NULL entries left over NULL gaps; once no non-NULL entry
 * is found to the right (j == sz), the remainder is all NULL and the
 * routine stops early.
 */
static void
pg_cmt_hier_pack(void *hier[], int sz)
{
	int	i, j;

	for (i = 0; i < sz; i++) {
		if (hier[i] != NULL)
			continue;

		/* Find the next non-NULL entry and pull it into slot i */
		for (j = i; j < sz; j++) {
			if (hier[j] != NULL) {
				hier[i] = hier[j];
				hier[j] = NULL;
				break;
			}
		}
		if (j == sz)
			break;
	}
}

/*
 * Return a cmt_lgrp_t * given an lgroup handle.
804fb2f18f8Sesaxe */ 805fb2f18f8Sesaxe static cmt_lgrp_t * 806fb2f18f8Sesaxe pg_cmt_find_lgrp(lgrp_handle_t hand) 807fb2f18f8Sesaxe { 808fb2f18f8Sesaxe cmt_lgrp_t *lgrp; 809fb2f18f8Sesaxe 810fb2f18f8Sesaxe ASSERT(MUTEX_HELD(&cpu_lock)); 811fb2f18f8Sesaxe 812fb2f18f8Sesaxe lgrp = cmt_lgrps; 813fb2f18f8Sesaxe while (lgrp != NULL) { 814fb2f18f8Sesaxe if (lgrp->cl_hand == hand) 815a6604450Sesaxe break; 816fb2f18f8Sesaxe lgrp = lgrp->cl_next; 817fb2f18f8Sesaxe } 818a6604450Sesaxe return (lgrp); 819a6604450Sesaxe } 820fb2f18f8Sesaxe 821fb2f18f8Sesaxe /* 822a6604450Sesaxe * Create a cmt_lgrp_t with the specified handle. 823fb2f18f8Sesaxe */ 824a6604450Sesaxe static cmt_lgrp_t * 825a6604450Sesaxe pg_cmt_lgrp_create(lgrp_handle_t hand) 826a6604450Sesaxe { 827a6604450Sesaxe cmt_lgrp_t *lgrp; 828a6604450Sesaxe 829a6604450Sesaxe ASSERT(MUTEX_HELD(&cpu_lock)); 830a6604450Sesaxe 831fb2f18f8Sesaxe lgrp = kmem_zalloc(sizeof (cmt_lgrp_t), KM_SLEEP); 832fb2f18f8Sesaxe 833fb2f18f8Sesaxe lgrp->cl_hand = hand; 834fb2f18f8Sesaxe lgrp->cl_npgs = 0; 835fb2f18f8Sesaxe lgrp->cl_next = cmt_lgrps; 836fb2f18f8Sesaxe cmt_lgrps = lgrp; 837fb2f18f8Sesaxe group_create(&lgrp->cl_pgs); 838fb2f18f8Sesaxe 839fb2f18f8Sesaxe return (lgrp); 840fb2f18f8Sesaxe } 8416890d023SEric Saxe 8426890d023SEric Saxe /* 8436890d023SEric Saxe * Perform multi-level CMT load balancing of running threads. 8446890d023SEric Saxe * 8456890d023SEric Saxe * tp is the thread being enqueued. 8466890d023SEric Saxe * cp is a hint CPU, against which CMT load balancing will be performed. 8476890d023SEric Saxe * 8486890d023SEric Saxe * Returns cp, or a CPU better than cp with respect to balancing 8496890d023SEric Saxe * running thread load. 
8506890d023SEric Saxe */ 8516890d023SEric Saxe cpu_t * 8526890d023SEric Saxe cmt_balance(kthread_t *tp, cpu_t *cp) 8536890d023SEric Saxe { 8546890d023SEric Saxe int hint, i, cpu, nsiblings; 8556890d023SEric Saxe int self = 0; 8566890d023SEric Saxe group_t *cmt_pgs, *siblings; 8576890d023SEric Saxe pg_cmt_t *pg, *pg_tmp, *tpg = NULL; 8586890d023SEric Saxe int pg_nrun, tpg_nrun; 8596890d023SEric Saxe int level = 0; 8606890d023SEric Saxe cpu_t *newcp; 8616890d023SEric Saxe 8626890d023SEric Saxe ASSERT(THREAD_LOCK_HELD(tp)); 8636890d023SEric Saxe 8646890d023SEric Saxe cmt_pgs = &cp->cpu_pg->cmt_pgs; 8656890d023SEric Saxe 8666890d023SEric Saxe if (GROUP_SIZE(cmt_pgs) == 0) 8676890d023SEric Saxe return (cp); /* nothing to do */ 8686890d023SEric Saxe 8696890d023SEric Saxe if (tp == curthread) 8706890d023SEric Saxe self = 1; 8716890d023SEric Saxe 8726890d023SEric Saxe /* 8736890d023SEric Saxe * Balance across siblings in the CPUs CMT lineage 8746890d023SEric Saxe * If the thread is homed to the root lgroup, perform 8756890d023SEric Saxe * top level balancing against other top level PGs 8766890d023SEric Saxe * in the system. Otherwise, start with the default 8776890d023SEric Saxe * top level siblings group, which is within the leaf lgroup 8786890d023SEric Saxe */ 8796890d023SEric Saxe pg = GROUP_ACCESS(cmt_pgs, level); 8806890d023SEric Saxe if (tp->t_lpl->lpl_lgrpid == LGRP_ROOTID) 8816890d023SEric Saxe siblings = &cmt_root->cl_pgs; 8826890d023SEric Saxe else 8836890d023SEric Saxe siblings = pg->cmt_siblings; 8846890d023SEric Saxe 8856890d023SEric Saxe /* 8866890d023SEric Saxe * Traverse down the lineage until we find a level that needs 8876890d023SEric Saxe * balancing, or we get to the end. 
8886890d023SEric Saxe */ 8896890d023SEric Saxe for (;;) { 8906890d023SEric Saxe nsiblings = GROUP_SIZE(siblings); /* self inclusive */ 8916890d023SEric Saxe if (nsiblings == 1) 8926890d023SEric Saxe goto next_level; 8936890d023SEric Saxe 8946890d023SEric Saxe pg_nrun = pg->cmt_nrunning; 8956890d023SEric Saxe if (self && 8966890d023SEric Saxe bitset_in_set(&pg->cmt_cpus_actv_set, CPU->cpu_seqid)) 8976890d023SEric Saxe pg_nrun--; /* Ignore curthread's effect */ 8986890d023SEric Saxe 8996890d023SEric Saxe hint = CPU_PSEUDO_RANDOM() % nsiblings; 9006890d023SEric Saxe 9016890d023SEric Saxe /* 9026890d023SEric Saxe * Find a balancing candidate from among our siblings 9036890d023SEric Saxe * "hint" is a hint for where to start looking 9046890d023SEric Saxe */ 9056890d023SEric Saxe i = hint; 9066890d023SEric Saxe do { 9076890d023SEric Saxe ASSERT(i < nsiblings); 9086890d023SEric Saxe pg_tmp = GROUP_ACCESS(siblings, i); 9096890d023SEric Saxe 9106890d023SEric Saxe /* 9116890d023SEric Saxe * The candidate must not be us, and must 9126890d023SEric Saxe * have some CPU resources in the thread's 9136890d023SEric Saxe * partition 9146890d023SEric Saxe */ 9156890d023SEric Saxe if (pg_tmp != pg && 9166890d023SEric Saxe bitset_in_set(&tp->t_cpupart->cp_cmt_pgs, 9176890d023SEric Saxe ((pg_t *)pg_tmp)->pg_id)) { 9186890d023SEric Saxe tpg = pg_tmp; 9196890d023SEric Saxe break; 9206890d023SEric Saxe } 9216890d023SEric Saxe 9226890d023SEric Saxe if (++i >= nsiblings) 9236890d023SEric Saxe i = 0; 9246890d023SEric Saxe } while (i != hint); 9256890d023SEric Saxe 9266890d023SEric Saxe if (!tpg) 9276890d023SEric Saxe goto next_level; /* no candidates at this level */ 9286890d023SEric Saxe 9296890d023SEric Saxe /* 9306890d023SEric Saxe * Check if the balancing target is underloaded 9316890d023SEric Saxe * Decide to balance if the target is running fewer 9326890d023SEric Saxe * threads, or if it's running the same number of threads 9336890d023SEric Saxe * with more online CPUs 9346890d023SEric 
Saxe */ 9356890d023SEric Saxe tpg_nrun = tpg->cmt_nrunning; 9366890d023SEric Saxe if (pg_nrun > tpg_nrun || 9376890d023SEric Saxe (pg_nrun == tpg_nrun && 9386890d023SEric Saxe (GROUP_SIZE(&tpg->cmt_cpus_actv) > 9396890d023SEric Saxe GROUP_SIZE(&pg->cmt_cpus_actv)))) { 9406890d023SEric Saxe break; 9416890d023SEric Saxe } 9426890d023SEric Saxe tpg = NULL; 9436890d023SEric Saxe 9446890d023SEric Saxe next_level: 9456890d023SEric Saxe if (++level == GROUP_SIZE(cmt_pgs)) 9466890d023SEric Saxe break; 9476890d023SEric Saxe 9486890d023SEric Saxe pg = GROUP_ACCESS(cmt_pgs, level); 9496890d023SEric Saxe siblings = pg->cmt_siblings; 9506890d023SEric Saxe } 9516890d023SEric Saxe 9526890d023SEric Saxe if (tpg) { 9536890d023SEric Saxe uint_t tgt_size = GROUP_SIZE(&tpg->cmt_cpus_actv); 9546890d023SEric Saxe 9556890d023SEric Saxe /* 9566890d023SEric Saxe * Select an idle CPU from the target 9576890d023SEric Saxe */ 9586890d023SEric Saxe hint = CPU_PSEUDO_RANDOM() % tgt_size; 9596890d023SEric Saxe cpu = hint; 9606890d023SEric Saxe do { 9616890d023SEric Saxe newcp = GROUP_ACCESS(&tpg->cmt_cpus_actv, cpu); 9626890d023SEric Saxe if (newcp->cpu_part == tp->t_cpupart && 9636890d023SEric Saxe newcp->cpu_dispatch_pri == -1) { 9646890d023SEric Saxe cp = newcp; 9656890d023SEric Saxe break; 9666890d023SEric Saxe } 9676890d023SEric Saxe if (++cpu == tgt_size) 9686890d023SEric Saxe cpu = 0; 9696890d023SEric Saxe } while (cpu != hint); 9706890d023SEric Saxe } 9716890d023SEric Saxe 9726890d023SEric Saxe return (cp); 9736890d023SEric Saxe } 974