/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/systm.h>
#include <sys/types.h>
#include <sys/param.h>
#include <sys/thread.h>
#include <sys/cpuvar.h>
#include <sys/cpupart.h>
#include <sys/cmn_err.h>
#include <sys/disp.h>
#include <sys/group.h>
#include <sys/bitset.h>
#include <sys/lgrp.h>
#include <sys/cmt.h>

/*
 * CMT dispatcher policies
 *
 * This file implements CMT dispatching policies using Processor Groups.
 *
 * The scheduler/dispatcher leverages knowledge of the performance-relevant
 * CMT sharing relationships existing between CPUs to implement load
 * balancing and coalescence thread placement policies.
 *
 * Load balancing policy seeks to improve performance by minimizing
 * contention over shared processor resources / facilities. Coalescence
 * policies improve resource utilization and ultimately power efficiency.
 *
 * On NUMA systems, the dispatcher will generally perform load balancing and
 * coalescence within (and not across) lgroups. This is because there isn't
 * much sense in trying to correct an imbalance by sending a thread outside
 * of its home lgroup if it would only attempt to return home a short while
 * later. The dispatcher will, however, implement CMT policy across lgroups
 * for threads homed to the root lgroup, since root-homed threads have no
 * lgroup affinity.
 */

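/*
 * For example (figures illustrative only), consider two sibling PGs where
 * "here" has a utilization of 3, "there" has a utilization of 1, and both
 * have a capacity of 4:
 *
 *	CMT_BALANCE:	"here" is busier than "there", so migrate to
 *			spread running threads across the siblings.
 *	CMT_COALESCE:	"there" is less utilized than "here", so stay put;
 *			coalescence only migrates toward a busier (but not
 *			yet saturated) group.
 *
 * cmt_should_migrate() below encodes these decisions.
 */
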
/*
 * Return non-zero if, given the policy, we should migrate from running
 * somewhere "here" to somewhere "there".
 */
static int
cmt_should_migrate(pg_cmt_t *here, pg_cmt_t *there, pg_cmt_policy_t policy,
    int self)
{
	uint32_t here_util, there_util;

	here_util = here->cmt_utilization;
	there_util = there->cmt_utilization;

	/*
	 * This assumes that curthread's utilization is "1"
	 */
	if (self && bitset_in_set(&here->cmt_cpus_actv_set, CPU->cpu_seqid))
		here_util--;	/* Ignore curthread's effect */

	/*
	 * Load balancing and coalescence are conflicting policies
	 */
	ASSERT((policy & (CMT_BALANCE|CMT_COALESCE)) !=
	    (CMT_BALANCE|CMT_COALESCE));

	if (policy & CMT_BALANCE) {
		/*
		 * Balance utilization
		 *
		 * If the target is comparatively underutilized
		 * (either in an absolute sense, or scaled by capacity),
		 * then choose to balance.
		 */
		if ((here_util > there_util) ||
		    (here_util == there_util &&
		    (CMT_CAPACITY(there) > CMT_CAPACITY(here)))) {
			return (1);
		}
	} else if (policy & CMT_COALESCE) {
		/*
		 * Attempt to drive group utilization up to capacity
		 */
		if (there_util > here_util &&
		    there_util < CMT_CAPACITY(there))
			return (1);
	}
	return (0);
}

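/*
 * Typical usage (a sketch, not lifted from any particular caller): a
 * dispatcher path that has already chosen a candidate CPU "cp" for a
 * runnable thread "tp", and that holds tp's thread lock, lets CMT policy
 * refine the choice:
 *
 *	ASSERT(THREAD_LOCK_HELD(tp));
 *	cp = cmt_balance(tp, cp);
 *
 * The returned CPU is either cp itself, or a CPU from a sibling PG that
 * the active policy considers a better placement.
 */
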
/*
 * Perform multi-level CMT load balancing of running threads.
 *
 * tp is the thread being enqueued.
 * cp is a hint CPU, against which CMT load balancing will be performed.
 *
 * Returns cp, or a CPU better than cp with respect to balancing
 * running thread load.
 */
cpu_t *
cmt_balance(kthread_t *tp, cpu_t *cp)
{
	int		hint, i, cpu, nsiblings;
	int		self = 0;
	group_t		*cmt_pgs, *siblings;
	pg_cmt_t	*pg, *pg_tmp, *tpg = NULL;
	int		level = 0;
	cpu_t		*newcp;
	extern cmt_lgrp_t *cmt_root;

	ASSERT(THREAD_LOCK_HELD(tp));

	cmt_pgs = &cp->cpu_pg->cmt_pgs;

	if (GROUP_SIZE(cmt_pgs) == 0)
		return (cp);	/* nothing to do */

	if (tp == curthread)
		self = 1;

	/*
	 * Balance across siblings in the CPU's CMT lineage.
	 * If the thread is homed to the root lgroup, perform
	 * top level balancing against other top level PGs
	 * in the system. Otherwise, start with the default
	 * top level siblings group, which is within the leaf lgroup.
	 */
	pg = GROUP_ACCESS(cmt_pgs, level);
	if (tp->t_lpl->lpl_lgrpid == LGRP_ROOTID)
		siblings = &cmt_root->cl_pgs;
	else
		siblings = pg->cmt_siblings;

	/*
	 * Traverse down the lineage until we find a level that needs
	 * balancing, or we get to the end.
	 */
	for (;;) {
		nsiblings = GROUP_SIZE(siblings);	/* self inclusive */
		if (nsiblings == 1)
			goto next_level;

		hint = CPU_PSEUDO_RANDOM() % nsiblings;

		/*
		 * Find a balancing candidate from among our siblings
		 * "hint" is a hint for where to start looking
		 */
		i = hint;
		do {
			ASSERT(i < nsiblings);
			pg_tmp = GROUP_ACCESS(siblings, i);

			/*
			 * The candidate must not be us, and must
			 * have some CPU resources in the thread's
			 * partition
			 */
			if (pg_tmp != pg &&
			    bitset_in_set(&tp->t_cpupart->cp_cmt_pgs,
			    ((pg_t *)pg_tmp)->pg_id)) {
				tpg = pg_tmp;
				break;
			}

			if (++i >= nsiblings)
				i = 0;
		} while (i != hint);

		if (!tpg)
			goto next_level;	/* no candidates at this level */

		/*
		 * Decide if we should migrate from the current PG to a
		 * target PG given a policy
		 */
		if (cmt_should_migrate(pg, tpg, pg->cmt_policy, self))
			break;
		tpg = NULL;

next_level:
		if (++level == GROUP_SIZE(cmt_pgs))
			break;

		pg = GROUP_ACCESS(cmt_pgs, level);
		siblings = pg->cmt_siblings;
	}

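	/*
	 * If the loop above settled on a target PG (tpg != NULL), try to
	 * hand the thread off to an idle CPU there; otherwise the original
	 * candidate CPU is returned unchanged.
	 */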
	if (tpg) {
		uint_t	tgt_size = GROUP_SIZE(&tpg->cmt_cpus_actv);

		/*
		 * Select an idle CPU from the target
		 */
		hint = CPU_PSEUDO_RANDOM() % tgt_size;
		cpu = hint;
		do {
			newcp = GROUP_ACCESS(&tpg->cmt_cpus_actv, cpu);
			if (newcp->cpu_part == tp->t_cpupart &&
			    newcp->cpu_dispatch_pri == -1) {
				cp = newcp;
				break;
			}
			if (++cpu == tgt_size)
				cpu = 0;
		} while (cpu != hint);
	}

	return (cp);
}