1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #include <sys/systm.h> 27 #include <sys/types.h> 28 #include <sys/param.h> 29 #include <sys/thread.h> 30 #include <sys/cpuvar.h> 31 #include <sys/cpupart.h> 32 #include <sys/cmn_err.h> 33 #include <sys/disp.h> 34 #include <sys/group.h> 35 #include <sys/bitset.h> 36 #include <sys/lgrp.h> 37 #include <sys/cmt.h> 38 39 /* 40 * CMT dispatcher policies 41 * 42 * This file implements CMT dispatching policies using Processor Groups. 43 * 44 * The scheduler/dispatcher leverages knowledge of the performance 45 * relevant CMT sharing relationships existing between CPUs to implement 46 * load balancing, and coalescence thread placement policies. 47 * 48 * Load balancing policy seeks to improve performance by minimizing 49 * contention over shared processor resources / facilities. Coalescence 50 * policies improve resource utilization and ultimately power efficiency. 51 * 52 * On NUMA systems, the dispatcher will generally perform load balancing and 53 * coalescence within (and not across) lgroups. This is because there isn't 54 * much sense in trying to correct an imbalance by sending a thread outside 55 * of its home, if it would attempt to return home a short while later. 56 * The dispatcher will implement CMT policy across lgroups however, if 57 * it can do so with a thread homed to the root lgroup, since root homed 58 * threads have no lgroup affinity. 59 */ 60 61 /* 62 * Return non-zero if, given the policy, we should migrate from running 63 * somewhere "here" to somewhere "there". 64 */ 65 static int 66 cmt_should_migrate(pg_cmt_t *here, pg_cmt_t *there, pg_cmt_policy_t policy, 67 int self) 68 { 69 uint32_t here_util, there_util; 70 71 here_util = here->cmt_utilization; 72 there_util = there->cmt_utilization; 73 74 /* 75 * This assumes that curthread's utilization is "1" 76 */ 77 if (self && bitset_in_set(&here->cmt_cpus_actv_set, CPU->cpu_seqid)) 78 here_util--; /* Ignore curthread's effect */ 79 80 /* 81 * Load balancing and coalescence are conflicting policies 82 */ 83 ASSERT((policy & (CMT_BALANCE|CMT_COALESCE)) != 84 (CMT_BALANCE|CMT_COALESCE)); 85 86 if (policy & CMT_BALANCE) { 87 /* 88 * Balance utilization 89 * 90 * If the target is comparatively underutilized 91 * (either in an absolute sense, or scaled by capacity), 92 * then choose to balance. 93 */ 94 if ((here_util > there_util) || 95 (here_util == there_util && 96 (CMT_CAPACITY(there) > CMT_CAPACITY(here)))) { 97 return (1); 98 } 99 } else if (policy & CMT_COALESCE) { 100 /* 101 * Attempt to drive group utilization up to capacity 102 */ 103 if (there_util > here_util && 104 there_util < CMT_CAPACITY(there)) 105 return (1); 106 } 107 return (0); 108 } 109 110 /* 111 * Perform multi-level CMT load balancing of running threads. 112 * 113 * tp is the thread being enqueued. 114 * cp is a hint CPU, against which CMT load balancing will be performed. 115 * 116 * Returns cp, or a CPU better than cp with respect to balancing 117 * running thread load. 118 */ 119 cpu_t * 120 cmt_balance(kthread_t *tp, cpu_t *cp) 121 { 122 int hint, i, cpu, nsiblings; 123 int self = 0; 124 group_t *cmt_pgs, *siblings; 125 pg_cmt_t *pg, *pg_tmp, *tpg = NULL; 126 int level = 0; 127 cpu_t *newcp; 128 extern cmt_lgrp_t *cmt_root; 129 130 ASSERT(THREAD_LOCK_HELD(tp)); 131 132 cmt_pgs = &cp->cpu_pg->cmt_pgs; 133 134 if (GROUP_SIZE(cmt_pgs) == 0) 135 return (cp); /* nothing to do */ 136 137 if (tp == curthread) 138 self = 1; 139 140 /* 141 * Balance across siblings in the CPUs CMT lineage 142 * If the thread is homed to the root lgroup, perform 143 * top level balancing against other top level PGs 144 * in the system. Otherwise, start with the default 145 * top level siblings group, which is within the leaf lgroup 146 */ 147 pg = GROUP_ACCESS(cmt_pgs, level); 148 if (tp->t_lpl->lpl_lgrpid == LGRP_ROOTID) 149 siblings = &cmt_root->cl_pgs; 150 else 151 siblings = pg->cmt_siblings; 152 153 /* 154 * Traverse down the lineage until we find a level that needs 155 * balancing, or we get to the end. 156 */ 157 for (;;) { 158 nsiblings = GROUP_SIZE(siblings); /* self inclusive */ 159 if (nsiblings == 1) 160 goto next_level; 161 162 hint = CPU_PSEUDO_RANDOM() % nsiblings; 163 164 /* 165 * Find a balancing candidate from among our siblings 166 * "hint" is a hint for where to start looking 167 */ 168 i = hint; 169 do { 170 ASSERT(i < nsiblings); 171 pg_tmp = GROUP_ACCESS(siblings, i); 172 173 /* 174 * The candidate must not be us, and must 175 * have some CPU resources in the thread's 176 * partition 177 */ 178 if (pg_tmp != pg && 179 bitset_in_set(&tp->t_cpupart->cp_cmt_pgs, 180 ((pg_t *)pg_tmp)->pg_id)) { 181 tpg = pg_tmp; 182 break; 183 } 184 185 if (++i >= nsiblings) 186 i = 0; 187 } while (i != hint); 188 189 if (!tpg) 190 goto next_level; /* no candidates at this level */ 191 192 /* 193 * Decide if we should migrate from the current PG to a 194 * target PG given a policy 195 */ 196 if (cmt_should_migrate(pg, tpg, pg->cmt_policy, self)) 197 break; 198 tpg = NULL; 199 200 next_level: 201 if (++level == GROUP_SIZE(cmt_pgs)) 202 break; 203 204 pg = GROUP_ACCESS(cmt_pgs, level); 205 siblings = pg->cmt_siblings; 206 } 207 208 if (tpg) { 209 uint_t tgt_size = GROUP_SIZE(&tpg->cmt_cpus_actv); 210 211 /* 212 * Select an idle CPU from the target 213 */ 214 hint = CPU_PSEUDO_RANDOM() % tgt_size; 215 cpu = hint; 216 do { 217 newcp = GROUP_ACCESS(&tpg->cmt_cpus_actv, cpu); 218 if (newcp->cpu_part == tp->t_cpupart && 219 newcp->cpu_dispatch_pri == -1) { 220 cp = newcp; 221 break; 222 } 223 if (++cpu == tgt_size) 224 cpu = 0; 225 } while (cpu != hint); 226 } 227 228 return (cp); 229 } 230