1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* 27 * Basic NUMA support in terms of locality groups 28 * 29 * Solaris needs to know which CPUs, memory, etc. are near each other to 30 * provide good performance on NUMA machines by optimizing for locality. 31 * In order to do this, a new abstraction called a "locality group (lgroup)" 32 * has been introduced to keep track of which CPU-like and memory-like hardware 33 * resources are close to each other. Currently, latency is the only measure 34 * used to determine how to group hardware resources into lgroups, but this 35 * does not limit the groupings to be based solely on latency. Other factors 36 * may be used to determine the groupings in the future. 37 * 38 * Lgroups are organized into a hieararchy or topology that represents the 39 * latency topology of the machine. There is always at least a root lgroup in 40 * the system. It represents all the hardware resources in the machine at a 41 * latency big enough that any hardware resource can at least access any other 42 * hardware resource within that latency. A Uniform Memory Access (UMA) 43 * machine is represented with one lgroup (the root). In contrast, a NUMA 44 * machine is represented at least by the root lgroup and some number of leaf 45 * lgroups where the leaf lgroups contain the hardware resources within the 46 * least latency of each other and the root lgroup still contains all the 47 * resources in the machine. Some number of intermediate lgroups may exist 48 * which represent more levels of locality than just the local latency of the 49 * leaf lgroups and the system latency of the root lgroup. Non-leaf lgroups 50 * (eg. root and intermediate lgroups) contain the next nearest resources to 51 * its children lgroups. Thus, the lgroup hierarchy from a given leaf lgroup 52 * to the root lgroup shows the hardware resources from closest to farthest 53 * from the leaf lgroup such that each successive ancestor lgroup contains 54 * the next nearest resources at the next level of locality from the previous. 55 * 56 * The kernel uses the lgroup abstraction to know how to allocate resources 57 * near a given process/thread. At fork() and lwp/thread_create() time, a 58 * "home" lgroup is chosen for a thread. This is done by picking the lgroup 59 * with the lowest load average. Binding to a processor or processor set will 60 * change the home lgroup for a thread. The scheduler has been modified to try 61 * to dispatch a thread on a CPU in its home lgroup. Physical memory 62 * allocation is lgroup aware too, so memory will be allocated from the current 63 * thread's home lgroup if possible. If the desired resources are not 64 * available, the kernel traverses the lgroup hierarchy going to the parent 65 * lgroup to find resources at the next level of locality until it reaches the 66 * root lgroup. 67 */ 68 69 #include <sys/lgrp.h> 70 #include <sys/lgrp_user.h> 71 #include <sys/types.h> 72 #include <sys/mman.h> 73 #include <sys/param.h> 74 #include <sys/var.h> 75 #include <sys/thread.h> 76 #include <sys/cpuvar.h> 77 #include <sys/cpupart.h> 78 #include <sys/kmem.h> 79 #include <vm/seg.h> 80 #include <vm/seg_kmem.h> 81 #include <vm/seg_spt.h> 82 #include <vm/seg_vn.h> 83 #include <vm/as.h> 84 #include <sys/atomic.h> 85 #include <sys/systm.h> 86 #include <sys/errno.h> 87 #include <sys/cmn_err.h> 88 #include <sys/kstat.h> 89 #include <sys/sysmacros.h> 90 #include <sys/pg.h> 91 #include <sys/promif.h> 92 #include <sys/sdt.h> 93 94 lgrp_gen_t lgrp_gen = 0; /* generation of lgroup hierarchy */ 95 lgrp_t *lgrp_table[NLGRPS_MAX]; /* table of all initialized lgrp_t structs */ 96 /* indexed by lgrp_id */ 97 int nlgrps; /* number of lgroups in machine */ 98 int lgrp_alloc_hint = -1; /* hint for where to try to allocate next */ 99 int lgrp_alloc_max = 0; /* max lgroup ID allocated so far */ 100 101 /* 102 * Kstat data for lgroups. 103 * 104 * Actual kstat data is collected in lgrp_stats array. 105 * The lgrp_kstat_data array of named kstats is used to extract data from 106 * lgrp_stats and present it to kstat framework. It is protected from partallel 107 * modifications by lgrp_kstat_mutex. This may cause some contention when 108 * several kstat commands run in parallel but this is not the 109 * performance-critical path. 110 */ 111 extern struct lgrp_stats lgrp_stats[]; /* table of per-lgrp stats */ 112 113 /* 114 * Declare kstat names statically for enums as defined in the header file. 115 */ 116 LGRP_KSTAT_NAMES; 117 118 static void lgrp_kstat_init(void); 119 static int lgrp_kstat_extract(kstat_t *, int); 120 static void lgrp_kstat_reset(lgrp_id_t); 121 122 static struct kstat_named lgrp_kstat_data[LGRP_NUM_STATS]; 123 static kmutex_t lgrp_kstat_mutex; 124 125 126 /* 127 * max number of lgroups supported by the platform 128 */ 129 int nlgrpsmax = 0; 130 131 /* 132 * The root lgroup. Represents the set of resources at the system wide 133 * level of locality. 134 */ 135 lgrp_t *lgrp_root = NULL; 136 137 /* 138 * During system bootstrap cp_default does not contain the list of lgrp load 139 * averages (cp_lgrploads). The list is allocated after the first CPU is brought 140 * on-line when cp_default is initialized by cpupart_initialize_default(). 141 * Configuring CPU0 may create a two-level topology with root and one leaf node 142 * containing CPU0. This topology is initially constructed in a special 143 * statically allocated 2-element lpl list lpl_bootstrap_list and later cloned 144 * to cp_default when cp_default is initialized. The lpl_bootstrap_list is used 145 * for all lpl operations until cp_default is fully constructed. 146 * 147 * The lpl_bootstrap_list is maintained by the code in lgrp.c. Every other 148 * consumer who needs default lpl should use lpl_bootstrap which is a pointer to 149 * the first element of lpl_bootstrap_list. 150 * 151 * CPUs that are added to the system, but have not yet been assigned to an 152 * lgrp will use lpl_bootstrap as a default lpl. This is necessary because 153 * on some architectures (x86) it's possible for the slave CPU startup thread 154 * to enter the dispatcher or allocate memory before calling lgrp_cpu_init(). 155 */ 156 #define LPL_BOOTSTRAP_SIZE 2 157 static lpl_t lpl_bootstrap_list[LPL_BOOTSTRAP_SIZE]; 158 lpl_t *lpl_bootstrap; 159 static lpl_t *lpl_bootstrap_rset[LPL_BOOTSTRAP_SIZE]; 160 static int lpl_bootstrap_id2rset[LPL_BOOTSTRAP_SIZE]; 161 162 /* 163 * If cp still references the bootstrap lpl, it has not yet been added to 164 * an lgrp. lgrp_mem_choose() uses this macro to detect the case where 165 * a thread is trying to allocate memory close to a CPU that has no lgrp. 166 */ 167 #define LGRP_CPU_HAS_NO_LGRP(cp) ((cp)->cpu_lpl == lpl_bootstrap) 168 169 static lgrp_t lroot; 170 171 /* 172 * Size, in bytes, beyond which random memory allocation policy is applied 173 * to non-shared memory. Default is the maximum size, so random memory 174 * allocation won't be used for non-shared memory by default. 175 */ 176 size_t lgrp_privm_random_thresh = (size_t)(-1); 177 178 /* the maximum effect that a single thread can have on it's lgroup's load */ 179 #define LGRP_LOADAVG_MAX_EFFECT(ncpu) \ 180 ((lgrp_loadavg_max_effect) / (ncpu)) 181 uint32_t lgrp_loadavg_max_effect = LGRP_LOADAVG_THREAD_MAX; 182 183 184 /* 185 * Size, in bytes, beyond which random memory allocation policy is applied to 186 * shared memory. Default is 8MB (2 ISM pages). 187 */ 188 size_t lgrp_shm_random_thresh = 8*1024*1024; 189 190 /* 191 * Whether to do processor set aware memory allocation by default 192 */ 193 int lgrp_mem_pset_aware = 0; 194 195 /* 196 * Set the default memory allocation policy for root lgroup 197 */ 198 lgrp_mem_policy_t lgrp_mem_policy_root = LGRP_MEM_POLICY_RANDOM; 199 200 /* 201 * Set the default memory allocation policy. For most platforms, 202 * next touch is sufficient, but some platforms may wish to override 203 * this. 204 */ 205 lgrp_mem_policy_t lgrp_mem_default_policy = LGRP_MEM_POLICY_NEXT; 206 207 208 /* 209 * lgroup CPU event handlers 210 */ 211 static void lgrp_cpu_init(struct cpu *); 212 static void lgrp_cpu_fini(struct cpu *, lgrp_id_t); 213 static lgrp_t *lgrp_cpu_to_lgrp(struct cpu *); 214 215 /* 216 * lgroup memory event handlers 217 */ 218 static void lgrp_mem_init(int, lgrp_handle_t, boolean_t); 219 static void lgrp_mem_fini(int, lgrp_handle_t, boolean_t); 220 static void lgrp_mem_rename(int, lgrp_handle_t, lgrp_handle_t); 221 222 /* 223 * lgroup CPU partition event handlers 224 */ 225 static void lgrp_part_add_cpu(struct cpu *, lgrp_id_t); 226 static void lgrp_part_del_cpu(struct cpu *); 227 228 static void lgrp_root_init(void); 229 230 /* 231 * lpl topology 232 */ 233 static void lpl_init(lpl_t *, lpl_t *, lgrp_t *); 234 static void lpl_clear(lpl_t *); 235 static void lpl_leaf_insert(lpl_t *, struct cpupart *); 236 static void lpl_leaf_remove(lpl_t *, struct cpupart *); 237 static void lpl_rset_add(lpl_t *, lpl_t *); 238 static void lpl_rset_del(lpl_t *, lpl_t *); 239 static int lpl_rset_contains(lpl_t *, lpl_t *); 240 static void lpl_cpu_adjcnt(lpl_act_t, struct cpu *); 241 static void lpl_child_update(lpl_t *, struct cpupart *); 242 static int lpl_pick(lpl_t *, lpl_t *); 243 static void lpl_verify_wrapper(struct cpupart *); 244 245 /* 246 * defines for lpl topology verifier return codes 247 */ 248 249 #define LPL_TOPO_CORRECT 0 250 #define LPL_TOPO_PART_HAS_NO_LPL -1 251 #define LPL_TOPO_CPUS_NOT_EMPTY -2 252 #define LPL_TOPO_LGRP_MISMATCH -3 253 #define LPL_TOPO_MISSING_PARENT -4 254 #define LPL_TOPO_PARENT_MISMATCH -5 255 #define LPL_TOPO_BAD_CPUCNT -6 256 #define LPL_TOPO_RSET_MISMATCH -7 257 #define LPL_TOPO_LPL_ORPHANED -8 258 #define LPL_TOPO_LPL_BAD_NCPU -9 259 #define LPL_TOPO_RSET_MSSNG_LF -10 260 #define LPL_TOPO_CPU_HAS_BAD_LPL -11 261 #define LPL_TOPO_NONLEAF_HAS_CPUS -12 262 #define LPL_TOPO_LGRP_NOT_LEAF -13 263 #define LPL_TOPO_BAD_RSETCNT -14 264 265 /* 266 * Return whether lgroup optimizations should be enabled on this system 267 */ 268 int 269 lgrp_optimizations(void) 270 { 271 /* 272 * System must have more than 2 lgroups to enable lgroup optimizations 273 * 274 * XXX This assumes that a 2 lgroup system has an empty root lgroup 275 * with one child lgroup containing all the resources. A 2 lgroup 276 * system with a root lgroup directly containing CPUs or memory might 277 * need lgroup optimizations with its child lgroup, but there 278 * isn't such a machine for now.... 279 */ 280 if (nlgrps > 2) 281 return (1); 282 283 return (0); 284 } 285 286 /* 287 * Build full lgroup topology 288 */ 289 static void 290 lgrp_root_init(void) 291 { 292 lgrp_handle_t hand; 293 int i; 294 lgrp_id_t id; 295 296 /* 297 * Create the "root" lgroup 298 */ 299 ASSERT(nlgrps == 0); 300 id = nlgrps++; 301 302 lgrp_root = &lroot; 303 304 lgrp_root->lgrp_cpu = NULL; 305 lgrp_root->lgrp_mnodes = 0; 306 lgrp_root->lgrp_nmnodes = 0; 307 hand = lgrp_plat_root_hand(); 308 lgrp_root->lgrp_plathand = hand; 309 310 lgrp_root->lgrp_id = id; 311 lgrp_root->lgrp_cpucnt = 0; 312 lgrp_root->lgrp_childcnt = 0; 313 klgrpset_clear(lgrp_root->lgrp_children); 314 klgrpset_clear(lgrp_root->lgrp_leaves); 315 lgrp_root->lgrp_parent = NULL; 316 lgrp_root->lgrp_latency = lgrp_plat_latency(hand, hand); 317 318 for (i = 0; i < LGRP_RSRC_COUNT; i++) 319 klgrpset_clear(lgrp_root->lgrp_set[i]); 320 321 lgrp_root->lgrp_kstat = NULL; 322 323 lgrp_table[id] = lgrp_root; 324 325 /* 326 * Setup initial lpl list for CPU0 and initial t0 home. 327 * The only lpl space we have so far is lpl_bootstrap. It is used for 328 * all topology operations until cp_default is initialized at which 329 * point t0.t_lpl will be updated. 330 */ 331 lpl_bootstrap = lpl_bootstrap_list; 332 t0.t_lpl = lpl_bootstrap; 333 cp_default.cp_nlgrploads = LPL_BOOTSTRAP_SIZE; 334 lpl_bootstrap_list[1].lpl_lgrpid = 1; 335 336 /* 337 * Set up the bootstrap rset 338 * Since the bootstrap toplogy has just the root, and a leaf, 339 * the rset contains just the leaf, and both lpls can use the same rset 340 */ 341 lpl_bootstrap_rset[0] = &lpl_bootstrap_list[1]; 342 lpl_bootstrap_list[0].lpl_rset_sz = 1; 343 lpl_bootstrap_list[0].lpl_rset = lpl_bootstrap_rset; 344 lpl_bootstrap_list[0].lpl_id2rset = lpl_bootstrap_id2rset; 345 346 lpl_bootstrap_list[1].lpl_rset_sz = 1; 347 lpl_bootstrap_list[1].lpl_rset = lpl_bootstrap_rset; 348 lpl_bootstrap_list[1].lpl_id2rset = lpl_bootstrap_id2rset; 349 350 cp_default.cp_lgrploads = lpl_bootstrap; 351 } 352 353 /* 354 * Initialize the lgroup framework and allow the platform to do the same 355 */ 356 void 357 lgrp_init(void) 358 { 359 /* 360 * Initialize the platform 361 */ 362 lgrp_plat_init(); 363 364 /* 365 * Set max number of lgroups supported on this platform which must be 366 * less than the max number of lgroups supported by the common lgroup 367 * framework (eg. NLGRPS_MAX is max elements in lgrp_table[], etc.) 368 */ 369 nlgrpsmax = lgrp_plat_max_lgrps(); 370 ASSERT(nlgrpsmax <= NLGRPS_MAX); 371 } 372 373 /* 374 * Create the root and cpu0's lgroup, and set t0's home. 375 */ 376 void 377 lgrp_setup(void) 378 { 379 /* 380 * Setup the root lgroup 381 */ 382 lgrp_root_init(); 383 384 /* 385 * Add cpu0 to an lgroup 386 */ 387 lgrp_config(LGRP_CONFIG_CPU_ADD, (uintptr_t)CPU, 0); 388 lgrp_config(LGRP_CONFIG_CPU_ONLINE, (uintptr_t)CPU, 0); 389 } 390 391 /* 392 * Lgroup initialization is split in two parts. The first part 393 * (lgrp_main_init()) is called right before start_other_cpus() in main. The 394 * second part (lgrp_main_mp_init()) is called right after start_other_cpus() 395 * when all CPUs are brought online and all distance information is available. 396 * 397 * When lgrp_main_init() is complete it sets lgrp_initialized. The 398 * lgrp_main_mp_init() sets lgrp_topo_initialized. 399 */ 400 401 /* 402 * true when lgrp initialization has been completed. 403 */ 404 int lgrp_initialized = 0; 405 406 /* 407 * True when lgrp topology is constructed. 408 */ 409 int lgrp_topo_initialized = 0; 410 411 /* 412 * Init routine called after startup(), /etc/system has been processed, 413 * and cpu0 has been added to an lgroup. 414 */ 415 void 416 lgrp_main_init(void) 417 { 418 cpu_t *cp = CPU; 419 lgrp_id_t lgrpid; 420 int i; 421 extern void pg_cpu0_reinit(); 422 423 /* 424 * Enforce a valid lgrp_mem_default_policy 425 */ 426 if ((lgrp_mem_default_policy <= LGRP_MEM_POLICY_DEFAULT) || 427 (lgrp_mem_default_policy >= LGRP_NUM_MEM_POLICIES) || 428 (lgrp_mem_default_policy == LGRP_MEM_POLICY_NEXT_SEG)) 429 lgrp_mem_default_policy = LGRP_MEM_POLICY_NEXT; 430 431 /* 432 * See if mpo should be disabled. 433 * This may happen in the case of null proc LPA on Starcat. 434 * The platform won't be able to detect null proc LPA until after 435 * cpu0 and memory have already been added to lgroups. 436 * When and if it is detected, the Starcat platform will return 437 * a different platform handle for cpu0 which is what we check for 438 * here. If mpo should be disabled move cpu0 to it's rightful place 439 * (the root), and destroy the remaining lgroups. This effectively 440 * provides an UMA lgroup topology. 441 */ 442 lgrpid = cp->cpu_lpl->lpl_lgrpid; 443 if (lgrp_table[lgrpid]->lgrp_plathand != 444 lgrp_plat_cpu_to_hand(cp->cpu_id)) { 445 lgrp_part_del_cpu(cp); 446 lgrp_cpu_fini(cp, lgrpid); 447 448 lgrp_cpu_init(cp); 449 lgrp_part_add_cpu(cp, cp->cpu_lpl->lpl_lgrpid); 450 451 ASSERT(cp->cpu_lpl->lpl_lgrpid == LGRP_ROOTID); 452 453 /* 454 * Notify the PG subsystem that the CPU's lgrp 455 * association has changed 456 */ 457 pg_cpu0_reinit(); 458 459 /* 460 * Destroy all lgroups except for root 461 */ 462 for (i = 0; i <= lgrp_alloc_max; i++) { 463 if (LGRP_EXISTS(lgrp_table[i]) && 464 lgrp_table[i] != lgrp_root) 465 lgrp_destroy(lgrp_table[i]); 466 } 467 468 /* 469 * Fix up root to point at itself for leaves and resources 470 * and not have any children 471 */ 472 lgrp_root->lgrp_childcnt = 0; 473 klgrpset_clear(lgrp_root->lgrp_children); 474 klgrpset_clear(lgrp_root->lgrp_leaves); 475 klgrpset_add(lgrp_root->lgrp_leaves, LGRP_ROOTID); 476 klgrpset_clear(lgrp_root->lgrp_set[LGRP_RSRC_MEM]); 477 klgrpset_add(lgrp_root->lgrp_set[LGRP_RSRC_MEM], LGRP_ROOTID); 478 } 479 480 /* 481 * Initialize kstats framework. 482 */ 483 lgrp_kstat_init(); 484 /* 485 * cpu0 is finally where it should be, so create it's lgroup's kstats 486 */ 487 mutex_enter(&cpu_lock); 488 lgrp_kstat_create(cp); 489 mutex_exit(&cpu_lock); 490 491 lgrp_plat_main_init(); 492 lgrp_initialized = 1; 493 } 494 495 /* 496 * Finish lgrp initialization after all CPUS are brought on-line. 497 * This routine is called after start_other_cpus(). 498 */ 499 void 500 lgrp_main_mp_init(void) 501 { 502 klgrpset_t changed; 503 504 /* 505 * Update lgroup topology (if necessary) 506 */ 507 klgrpset_clear(changed); 508 (void) lgrp_topo_update(lgrp_table, lgrp_alloc_max + 1, &changed); 509 lgrp_topo_initialized = 1; 510 } 511 512 /* 513 * Change latency of lgroup with specified lgroup platform handle (if one is 514 * given) or change all lgroups with old latency to new latency 515 */ 516 void 517 lgrp_latency_change(lgrp_handle_t hand, u_longlong_t oldtime, 518 u_longlong_t newtime) 519 { 520 lgrp_t *lgrp; 521 int i; 522 523 for (i = 0; i <= lgrp_alloc_max; i++) { 524 lgrp = lgrp_table[i]; 525 526 if (!LGRP_EXISTS(lgrp)) 527 continue; 528 529 if ((hand == LGRP_NULL_HANDLE && 530 lgrp->lgrp_latency == oldtime) || 531 (hand != LGRP_NULL_HANDLE && lgrp->lgrp_plathand == hand)) 532 lgrp->lgrp_latency = (int)newtime; 533 } 534 } 535 536 /* 537 * Handle lgroup (re)configuration events (eg. addition of CPU, etc.) 538 */ 539 void 540 lgrp_config(lgrp_config_flag_t event, uintptr_t resource, uintptr_t where) 541 { 542 klgrpset_t changed; 543 cpu_t *cp; 544 lgrp_id_t id; 545 int rc; 546 547 switch (event) { 548 /* 549 * The following (re)configuration events are common code 550 * initiated. lgrp_plat_config() is called here to inform the 551 * platform of the reconfiguration event. 552 */ 553 case LGRP_CONFIG_CPU_ADD: 554 cp = (cpu_t *)resource; 555 556 /* 557 * Initialize the new CPU's lgrp related next/prev 558 * links, and give it a bootstrap lpl so that it can 559 * survive should it need to enter the dispatcher. 560 */ 561 cp->cpu_next_lpl = cp; 562 cp->cpu_prev_lpl = cp; 563 cp->cpu_next_lgrp = cp; 564 cp->cpu_prev_lgrp = cp; 565 cp->cpu_lpl = lpl_bootstrap; 566 567 lgrp_plat_config(event, resource); 568 atomic_add_32(&lgrp_gen, 1); 569 570 break; 571 case LGRP_CONFIG_CPU_DEL: 572 lgrp_plat_config(event, resource); 573 atomic_add_32(&lgrp_gen, 1); 574 575 break; 576 case LGRP_CONFIG_CPU_ONLINE: 577 cp = (cpu_t *)resource; 578 lgrp_cpu_init(cp); 579 lgrp_part_add_cpu(cp, cp->cpu_lpl->lpl_lgrpid); 580 rc = lpl_topo_verify(cp->cpu_part); 581 if (rc != LPL_TOPO_CORRECT) { 582 panic("lpl_topo_verify failed: %d", rc); 583 } 584 lgrp_plat_config(event, resource); 585 atomic_add_32(&lgrp_gen, 1); 586 587 break; 588 case LGRP_CONFIG_CPU_OFFLINE: 589 cp = (cpu_t *)resource; 590 id = cp->cpu_lpl->lpl_lgrpid; 591 lgrp_part_del_cpu(cp); 592 lgrp_cpu_fini(cp, id); 593 rc = lpl_topo_verify(cp->cpu_part); 594 if (rc != LPL_TOPO_CORRECT) { 595 panic("lpl_topo_verify failed: %d", rc); 596 } 597 lgrp_plat_config(event, resource); 598 atomic_add_32(&lgrp_gen, 1); 599 600 break; 601 case LGRP_CONFIG_CPUPART_ADD: 602 cp = (cpu_t *)resource; 603 lgrp_part_add_cpu((cpu_t *)resource, (lgrp_id_t)where); 604 rc = lpl_topo_verify(cp->cpu_part); 605 if (rc != LPL_TOPO_CORRECT) { 606 panic("lpl_topo_verify failed: %d", rc); 607 } 608 lgrp_plat_config(event, resource); 609 610 break; 611 case LGRP_CONFIG_CPUPART_DEL: 612 cp = (cpu_t *)resource; 613 lgrp_part_del_cpu((cpu_t *)resource); 614 rc = lpl_topo_verify(cp->cpu_part); 615 if (rc != LPL_TOPO_CORRECT) { 616 panic("lpl_topo_verify failed: %d", rc); 617 } 618 lgrp_plat_config(event, resource); 619 620 break; 621 /* 622 * The following events are initiated by the memnode 623 * subsystem. 624 */ 625 case LGRP_CONFIG_MEM_ADD: 626 lgrp_mem_init((int)resource, where, B_FALSE); 627 atomic_add_32(&lgrp_gen, 1); 628 629 break; 630 case LGRP_CONFIG_MEM_DEL: 631 lgrp_mem_fini((int)resource, where, B_FALSE); 632 atomic_add_32(&lgrp_gen, 1); 633 634 break; 635 case LGRP_CONFIG_MEM_RENAME: { 636 lgrp_config_mem_rename_t *ren_arg = 637 (lgrp_config_mem_rename_t *)where; 638 639 lgrp_mem_rename((int)resource, 640 ren_arg->lmem_rename_from, 641 ren_arg->lmem_rename_to); 642 atomic_add_32(&lgrp_gen, 1); 643 644 break; 645 } 646 case LGRP_CONFIG_GEN_UPDATE: 647 atomic_add_32(&lgrp_gen, 1); 648 649 break; 650 case LGRP_CONFIG_FLATTEN: 651 if (where == 0) 652 lgrp_topo_levels = (int)resource; 653 else 654 (void) lgrp_topo_flatten(resource, 655 lgrp_table, lgrp_alloc_max, &changed); 656 657 break; 658 /* 659 * Update any lgroups with old latency to new latency 660 */ 661 case LGRP_CONFIG_LAT_CHANGE_ALL: 662 lgrp_latency_change(LGRP_NULL_HANDLE, (u_longlong_t)resource, 663 (u_longlong_t)where); 664 665 break; 666 /* 667 * Update lgroup with specified lgroup platform handle to have 668 * new latency 669 */ 670 case LGRP_CONFIG_LAT_CHANGE: 671 lgrp_latency_change((lgrp_handle_t)resource, 0, 672 (u_longlong_t)where); 673 674 break; 675 case LGRP_CONFIG_NOP: 676 677 break; 678 default: 679 break; 680 } 681 682 } 683 684 /* 685 * Called to add lgrp info into cpu structure from cpu_add_unit; 686 * do not assume cpu is in cpu[] yet! 687 * 688 * CPUs are brought online with all other CPUs paused so we can't 689 * allocate memory or we could deadlock the system, so we rely on 690 * the platform to statically allocate as much space as we need 691 * for the lgrp structs and stats. 692 */ 693 static void 694 lgrp_cpu_init(struct cpu *cp) 695 { 696 klgrpset_t changed; 697 int count; 698 lgrp_handle_t hand; 699 int first_cpu; 700 lgrp_t *my_lgrp; 701 lgrp_id_t lgrpid; 702 struct cpu *cptr; 703 704 /* 705 * This is the first time through if the resource set 706 * for the root lgroup is empty. After cpu0 has been 707 * initially added to an lgroup, the root's CPU resource 708 * set can never be empty, since the system's last CPU 709 * cannot be offlined. 710 */ 711 if (klgrpset_isempty(lgrp_root->lgrp_set[LGRP_RSRC_CPU])) { 712 /* 713 * First time through. 714 */ 715 first_cpu = 1; 716 } else { 717 /* 718 * If cpu0 needs to move lgroups, we may come 719 * through here again, at which time cpu_lock won't 720 * be held, and lgrp_initialized will be false. 721 */ 722 ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized); 723 ASSERT(cp->cpu_part != NULL); 724 first_cpu = 0; 725 } 726 727 hand = lgrp_plat_cpu_to_hand(cp->cpu_id); 728 my_lgrp = lgrp_hand_to_lgrp(hand); 729 730 if (my_lgrp == NULL) { 731 /* 732 * Create new lgrp and add it to lgroup topology 733 */ 734 my_lgrp = lgrp_create(); 735 my_lgrp->lgrp_plathand = hand; 736 my_lgrp->lgrp_latency = lgrp_plat_latency(hand, hand); 737 lgrpid = my_lgrp->lgrp_id; 738 klgrpset_add(my_lgrp->lgrp_leaves, lgrpid); 739 klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid); 740 741 count = 0; 742 klgrpset_clear(changed); 743 count += lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1, 744 &changed); 745 /* 746 * May have added new intermediate lgroups, so need to add 747 * resources other than CPUs which are added below 748 */ 749 (void) lgrp_mnode_update(changed, NULL); 750 } else if (my_lgrp->lgrp_latency == 0 && lgrp_plat_latency(hand, hand) 751 > 0) { 752 /* 753 * Leaf lgroup was created, but latency wasn't available 754 * then. So, set latency for it and fill in rest of lgroup 755 * topology now that we know how far it is from other leaf 756 * lgroups. 757 */ 758 lgrpid = my_lgrp->lgrp_id; 759 klgrpset_clear(changed); 760 if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_CPU], 761 lgrpid)) 762 klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid); 763 count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1, 764 &changed); 765 766 /* 767 * May have added new intermediate lgroups, so need to add 768 * resources other than CPUs which are added below 769 */ 770 (void) lgrp_mnode_update(changed, NULL); 771 } else if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_CPU], 772 my_lgrp->lgrp_id)) { 773 int i; 774 775 /* 776 * Update existing lgroup and lgroups containing it with CPU 777 * resource 778 */ 779 lgrpid = my_lgrp->lgrp_id; 780 klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid); 781 for (i = 0; i <= lgrp_alloc_max; i++) { 782 lgrp_t *lgrp; 783 784 lgrp = lgrp_table[i]; 785 if (!LGRP_EXISTS(lgrp) || 786 !lgrp_rsets_member(lgrp->lgrp_set, lgrpid)) 787 continue; 788 789 klgrpset_add(lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid); 790 } 791 } 792 793 lgrpid = my_lgrp->lgrp_id; 794 cp->cpu_lpl = &cp->cpu_part->cp_lgrploads[lgrpid]; 795 796 /* 797 * For multi-lgroup systems, need to setup lpl for CPU0 or CPU0 will 798 * end up in lpl for lgroup 0 whether it is supposed to be in there or 799 * not since none of lgroup IDs in the lpl's have been set yet. 800 */ 801 if (first_cpu && nlgrpsmax > 1 && lgrpid != cp->cpu_lpl->lpl_lgrpid) 802 cp->cpu_lpl->lpl_lgrpid = lgrpid; 803 804 /* 805 * link the CPU into the lgrp's CPU list 806 */ 807 if (my_lgrp->lgrp_cpucnt == 0) { 808 my_lgrp->lgrp_cpu = cp; 809 cp->cpu_next_lgrp = cp->cpu_prev_lgrp = cp; 810 } else { 811 cptr = my_lgrp->lgrp_cpu; 812 cp->cpu_next_lgrp = cptr; 813 cp->cpu_prev_lgrp = cptr->cpu_prev_lgrp; 814 cptr->cpu_prev_lgrp->cpu_next_lgrp = cp; 815 cptr->cpu_prev_lgrp = cp; 816 } 817 my_lgrp->lgrp_cpucnt++; 818 } 819 820 lgrp_t * 821 lgrp_create(void) 822 { 823 lgrp_t *my_lgrp; 824 lgrp_id_t lgrpid; 825 int i; 826 827 ASSERT(!lgrp_initialized || MUTEX_HELD(&cpu_lock)); 828 829 /* 830 * Find an open slot in the lgroup table and recycle unused lgroup 831 * left there if any 832 */ 833 my_lgrp = NULL; 834 if (lgrp_alloc_hint == -1) 835 /* 836 * Allocate from end when hint not set yet because no lgroups 837 * have been deleted yet 838 */ 839 lgrpid = nlgrps++; 840 else { 841 /* 842 * Start looking for next open slot from hint and leave hint 843 * at slot allocated 844 */ 845 for (i = lgrp_alloc_hint; i < nlgrpsmax; i++) { 846 my_lgrp = lgrp_table[i]; 847 if (!LGRP_EXISTS(my_lgrp)) { 848 lgrpid = i; 849 nlgrps++; 850 break; 851 } 852 } 853 lgrp_alloc_hint = lgrpid; 854 } 855 856 /* 857 * Keep track of max lgroup ID allocated so far to cut down on searches 858 */ 859 if (lgrpid > lgrp_alloc_max) 860 lgrp_alloc_max = lgrpid; 861 862 /* 863 * Need to allocate new lgroup if next open slot didn't have one 864 * for recycling 865 */ 866 if (my_lgrp == NULL) 867 my_lgrp = lgrp_plat_alloc(lgrpid); 868 869 if (nlgrps > nlgrpsmax || my_lgrp == NULL) 870 panic("Too many lgrps for platform (%d)", nlgrps); 871 872 my_lgrp->lgrp_id = lgrpid; 873 my_lgrp->lgrp_latency = 0; 874 my_lgrp->lgrp_plathand = LGRP_NULL_HANDLE; 875 my_lgrp->lgrp_parent = NULL; 876 my_lgrp->lgrp_childcnt = 0; 877 my_lgrp->lgrp_mnodes = (mnodeset_t)0; 878 my_lgrp->lgrp_nmnodes = 0; 879 klgrpset_clear(my_lgrp->lgrp_children); 880 klgrpset_clear(my_lgrp->lgrp_leaves); 881 for (i = 0; i < LGRP_RSRC_COUNT; i++) 882 klgrpset_clear(my_lgrp->lgrp_set[i]); 883 884 my_lgrp->lgrp_cpu = NULL; 885 my_lgrp->lgrp_cpucnt = 0; 886 887 if (my_lgrp->lgrp_kstat != NULL) 888 lgrp_kstat_reset(lgrpid); 889 890 lgrp_table[my_lgrp->lgrp_id] = my_lgrp; 891 892 return (my_lgrp); 893 } 894 895 void 896 lgrp_destroy(lgrp_t *lgrp) 897 { 898 int i; 899 900 /* 901 * Unless this lgroup is being destroyed on behalf of 902 * the boot CPU, cpu_lock must be held 903 */ 904 ASSERT(!lgrp_initialized || MUTEX_HELD(&cpu_lock)); 905 906 if (nlgrps == 1) 907 cmn_err(CE_PANIC, "Can't destroy only lgroup!"); 908 909 if (!LGRP_EXISTS(lgrp)) 910 return; 911 912 /* 913 * Set hint to lgroup being deleted and try to keep lower numbered 914 * hints to facilitate finding empty slots 915 */ 916 if (lgrp_alloc_hint == -1 || lgrp->lgrp_id < lgrp_alloc_hint) 917 lgrp_alloc_hint = lgrp->lgrp_id; 918 919 /* 920 * Mark this lgroup to be recycled by setting its lgroup ID to 921 * LGRP_NONE and clear relevant fields 922 */ 923 lgrp->lgrp_id = LGRP_NONE; 924 lgrp->lgrp_latency = 0; 925 lgrp->lgrp_plathand = LGRP_NULL_HANDLE; 926 lgrp->lgrp_parent = NULL; 927 lgrp->lgrp_childcnt = 0; 928 929 klgrpset_clear(lgrp->lgrp_children); 930 klgrpset_clear(lgrp->lgrp_leaves); 931 for (i = 0; i < LGRP_RSRC_COUNT; i++) 932 klgrpset_clear(lgrp->lgrp_set[i]); 933 934 lgrp->lgrp_mnodes = (mnodeset_t)0; 935 lgrp->lgrp_nmnodes = 0; 936 937 lgrp->lgrp_cpu = NULL; 938 lgrp->lgrp_cpucnt = 0; 939 940 nlgrps--; 941 } 942 943 /* 944 * Initialize kstat data. Called from lgrp intialization code. 945 */ 946 static void 947 lgrp_kstat_init(void) 948 { 949 lgrp_stat_t stat; 950 951 mutex_init(&lgrp_kstat_mutex, NULL, MUTEX_DEFAULT, NULL); 952 953 for (stat = 0; stat < LGRP_NUM_STATS; stat++) 954 kstat_named_init(&lgrp_kstat_data[stat], 955 lgrp_kstat_names[stat], KSTAT_DATA_INT64); 956 } 957 958 /* 959 * initialize an lgrp's kstats if needed 960 * called with cpu_lock held but not with cpus paused. 961 * we don't tear these down now because we don't know about 962 * memory leaving the lgrp yet... 963 */ 964 965 void 966 lgrp_kstat_create(cpu_t *cp) 967 { 968 kstat_t *lgrp_kstat; 969 lgrp_id_t lgrpid; 970 lgrp_t *my_lgrp; 971 972 ASSERT(MUTEX_HELD(&cpu_lock)); 973 974 lgrpid = cp->cpu_lpl->lpl_lgrpid; 975 my_lgrp = lgrp_table[lgrpid]; 976 977 if (my_lgrp->lgrp_kstat != NULL) 978 return; /* already initialized */ 979 980 lgrp_kstat = kstat_create("lgrp", lgrpid, NULL, "misc", 981 KSTAT_TYPE_NAMED, LGRP_NUM_STATS, 982 KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE); 983 984 if (lgrp_kstat != NULL) { 985 lgrp_kstat->ks_lock = &lgrp_kstat_mutex; 986 lgrp_kstat->ks_private = my_lgrp; 987 lgrp_kstat->ks_data = &lgrp_kstat_data; 988 lgrp_kstat->ks_update = lgrp_kstat_extract; 989 my_lgrp->lgrp_kstat = lgrp_kstat; 990 kstat_install(lgrp_kstat); 991 } 992 } 993 994 /* 995 * this will do something when we manage to remove now unused lgrps 996 */ 997 998 /* ARGSUSED */ 999 void 1000 lgrp_kstat_destroy(cpu_t *cp) 1001 { 1002 ASSERT(MUTEX_HELD(&cpu_lock)); 1003 } 1004 1005 /* 1006 * Called when a CPU is off-lined. 1007 */ 1008 static void 1009 lgrp_cpu_fini(struct cpu *cp, lgrp_id_t lgrpid) 1010 { 1011 lgrp_t *my_lgrp; 1012 struct cpu *prev; 1013 struct cpu *next; 1014 1015 ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized); 1016 1017 prev = cp->cpu_prev_lgrp; 1018 next = cp->cpu_next_lgrp; 1019 1020 prev->cpu_next_lgrp = next; 1021 next->cpu_prev_lgrp = prev; 1022 1023 /* 1024 * just because I'm paranoid doesn't mean... 1025 */ 1026 1027 cp->cpu_next_lgrp = cp->cpu_prev_lgrp = NULL; 1028 1029 my_lgrp = lgrp_table[lgrpid]; 1030 my_lgrp->lgrp_cpucnt--; 1031 1032 /* 1033 * Removing last CPU in lgroup, so update lgroup topology 1034 */ 1035 if (my_lgrp->lgrp_cpucnt == 0) { 1036 klgrpset_t changed; 1037 int count; 1038 int i; 1039 1040 my_lgrp->lgrp_cpu = NULL; 1041 1042 /* 1043 * Remove this lgroup from its lgroup CPU resources and remove 1044 * lgroup from lgroup topology if it doesn't have any more 1045 * resources in it now 1046 */ 1047 klgrpset_del(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid); 1048 if (lgrp_rsets_empty(my_lgrp->lgrp_set)) { 1049 count = 0; 1050 klgrpset_clear(changed); 1051 count += lgrp_leaf_delete(my_lgrp, lgrp_table, 1052 lgrp_alloc_max + 1, &changed); 1053 return; 1054 } 1055 1056 /* 1057 * This lgroup isn't empty, so just remove it from CPU 1058 * resources of any lgroups that contain it as such 1059 */ 1060 for (i = 0; i <= lgrp_alloc_max; i++) { 1061 lgrp_t *lgrp; 1062 1063 lgrp = lgrp_table[i]; 1064 if (!LGRP_EXISTS(lgrp) || 1065 !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_CPU], 1066 lgrpid)) 1067 continue; 1068 1069 klgrpset_del(lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid); 1070 } 1071 return; 1072 } 1073 1074 if (my_lgrp->lgrp_cpu == cp) 1075 my_lgrp->lgrp_cpu = next; 1076 1077 } 1078 1079 /* 1080 * Update memory nodes in target lgroups and return ones that get changed 1081 */ 1082 int 1083 lgrp_mnode_update(klgrpset_t target, klgrpset_t *changed) 1084 { 1085 int count; 1086 int i; 1087 int j; 1088 lgrp_t *lgrp; 1089 lgrp_t *lgrp_rsrc; 1090 1091 count = 0; 1092 if (changed) 1093 klgrpset_clear(*changed); 1094 1095 if (klgrpset_isempty(target)) 1096 return (0); 1097 1098 /* 1099 * Find each lgroup in target lgroups 1100 */ 1101 for (i = 0; i <= lgrp_alloc_max; i++) { 1102 /* 1103 * Skip any lgroups that don't exist or aren't in target group 1104 */ 1105 lgrp = lgrp_table[i]; 1106 if (!klgrpset_ismember(target, i) || !LGRP_EXISTS(lgrp)) { 1107 continue; 1108 } 1109 1110 /* 1111 * Initialize memnodes for intermediate lgroups to 0 1112 * and update them from scratch since they may have completely 1113 * changed 1114 */ 1115 if (lgrp->lgrp_childcnt && lgrp != lgrp_root) { 1116 lgrp->lgrp_mnodes = (mnodeset_t)0; 1117 lgrp->lgrp_nmnodes = 0; 1118 } 1119 1120 /* 1121 * Update memory nodes of of target lgroup with memory nodes 1122 * from each lgroup in its lgroup memory resource set 1123 */ 1124 for (j = 0; j <= lgrp_alloc_max; j++) { 1125 int k; 1126 1127 /* 1128 * Skip any lgroups that don't exist or aren't in 1129 * memory resources of target lgroup 1130 */ 1131 lgrp_rsrc = lgrp_table[j]; 1132 if (!LGRP_EXISTS(lgrp_rsrc) || 1133 !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM], 1134 j)) 1135 continue; 1136 1137 /* 1138 * Update target lgroup's memnodes to include memnodes 1139 * of this lgroup 1140 */ 1141 for (k = 0; k < sizeof (mnodeset_t) * NBBY; k++) { 1142 mnodeset_t mnode_mask; 1143 1144 mnode_mask = (mnodeset_t)1 << k; 1145 if ((lgrp_rsrc->lgrp_mnodes & mnode_mask) && 1146 !(lgrp->lgrp_mnodes & mnode_mask)) { 1147 lgrp->lgrp_mnodes |= mnode_mask; 1148 lgrp->lgrp_nmnodes++; 1149 } 1150 } 1151 count++; 1152 if (changed) 1153 klgrpset_add(*changed, lgrp->lgrp_id); 1154 } 1155 } 1156 1157 return (count); 1158 } 1159 1160 /* 1161 * Memory copy-rename. Called when the "mnode" containing the kernel cage memory 1162 * is moved from one board to another. The "from" and "to" arguments specify the 1163 * source and the destination of the move. 1164 * 1165 * See plat_lgrp_config() for a detailed description of the copy-rename 1166 * semantics. 1167 * 1168 * The lgrp_mem_rename() is called by the platform copy-rename code to update 1169 * the lgroup topology which is changing as memory moves from one lgroup to 1170 * another. It removes the mnode from the source lgroup and re-inserts it in the 1171 * target lgroup. 1172 * 1173 * The lgrp_mem_rename() function passes a flag to lgrp_mem_init() and 1174 * lgrp_mem_fini() telling that the insertion and deleteion are part of a DR 1175 * copy-rename operation. 1176 * 1177 * There is one case which requires special handling. If the system contains 1178 * only two boards (mnodes), the lgrp_mem_fini() removes the only mnode from the 1179 * lgroup hierarchy. This mnode is soon re-inserted back in the hierarchy by 1180 * lgrp_mem_init), but there is a window when the system has no memory in the 1181 * lgroup hierarchy. If another thread tries to allocate memory during this 1182 * window, the allocation will fail, although the system has physical memory. 1183 * This may cause a system panic or a deadlock (some sleeping memory allocations 1184 * happen with cpu_lock held which prevents lgrp_mem_init() from re-inserting 1185 * the mnode back). 1186 * 1187 * The lgrp_memnode_choose() function walks the lgroup hierarchy looking for the 1188 * lgrp with non-empty lgrp_mnodes. To deal with the special case above, 1189 * lgrp_mem_fini() does not remove the last mnode from the lroot->lgrp_mnodes, 1190 * but it updates the rest of the lgroup topology as if the mnode was actually 1191 * removed. The lgrp_mem_init() function recognizes that the mnode being 1192 * inserted represents such a special case and updates the topology 1193 * appropriately. 1194 */ 1195 void 1196 lgrp_mem_rename(int mnode, lgrp_handle_t from, lgrp_handle_t to) 1197 { 1198 /* 1199 * Remove the memory from the source node and add it to the destination 1200 * node. 1201 */ 1202 lgrp_mem_fini(mnode, from, B_TRUE); 1203 lgrp_mem_init(mnode, to, B_TRUE); 1204 } 1205 1206 /* 1207 * Called to indicate that the lgrp with platform handle "hand" now 1208 * contains the memory identified by "mnode". 1209 * 1210 * LOCKING for this routine is a bit tricky. Usually it is called without 1211 * cpu_lock and it must must grab cpu_lock here to prevent racing with other 1212 * callers. During DR of the board containing the caged memory it may be called 1213 * with cpu_lock already held and CPUs paused. 1214 * 1215 * If the insertion is part of the DR copy-rename and the inserted mnode (and 1216 * only this mnode) is already present in the lgrp_root->lgrp_mnodes set, we are 1217 * dealing with the special case of DR copy-rename described in 1218 * lgrp_mem_rename(). 1219 */ 1220 void 1221 lgrp_mem_init(int mnode, lgrp_handle_t hand, boolean_t is_copy_rename) 1222 { 1223 klgrpset_t changed; 1224 int count; 1225 int i; 1226 lgrp_t *my_lgrp; 1227 lgrp_id_t lgrpid; 1228 mnodeset_t mnodes_mask = ((mnodeset_t)1 << mnode); 1229 boolean_t drop_lock = B_FALSE; 1230 boolean_t need_synch = B_FALSE; 1231 1232 /* 1233 * Grab CPU lock (if we haven't already) 1234 */ 1235 if (!MUTEX_HELD(&cpu_lock)) { 1236 mutex_enter(&cpu_lock); 1237 drop_lock = B_TRUE; 1238 } 1239 1240 /* 1241 * This routine may be called from a context where we already 1242 * hold cpu_lock, and have already paused cpus. 1243 */ 1244 if (!cpus_paused()) 1245 need_synch = B_TRUE; 1246 1247 /* 1248 * Check if this mnode is already configured and return immediately if 1249 * it is. 1250 * 1251 * NOTE: in special case of copy-rename of the only remaining mnode, 1252 * lgrp_mem_fini() refuses to remove the last mnode from the root, so we 1253 * recognize this case and continue as usual, but skip the update to 1254 * the lgrp_mnodes and the lgrp_nmnodes. This restores the inconsistency 1255 * in topology, temporarily introduced by lgrp_mem_fini(). 1256 */ 1257 if (! (is_copy_rename && (lgrp_root->lgrp_mnodes == mnodes_mask)) && 1258 lgrp_root->lgrp_mnodes & mnodes_mask) { 1259 if (drop_lock) 1260 mutex_exit(&cpu_lock); 1261 return; 1262 } 1263 1264 /* 1265 * Update lgroup topology with new memory resources, keeping track of 1266 * which lgroups change 1267 */ 1268 count = 0; 1269 klgrpset_clear(changed); 1270 my_lgrp = lgrp_hand_to_lgrp(hand); 1271 if (my_lgrp == NULL) { 1272 /* new lgrp */ 1273 my_lgrp = lgrp_create(); 1274 lgrpid = my_lgrp->lgrp_id; 1275 my_lgrp->lgrp_plathand = hand; 1276 my_lgrp->lgrp_latency = lgrp_plat_latency(hand, hand); 1277 klgrpset_add(my_lgrp->lgrp_leaves, lgrpid); 1278 klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid); 1279 1280 if (need_synch) 1281 pause_cpus(NULL); 1282 count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1, 1283 &changed); 1284 if (need_synch) 1285 start_cpus(); 1286 } else if (my_lgrp->lgrp_latency == 0 && lgrp_plat_latency(hand, hand) 1287 > 0) { 1288 /* 1289 * Leaf lgroup was created, but latency wasn't available 1290 * then. So, set latency for it and fill in rest of lgroup 1291 * topology now that we know how far it is from other leaf 1292 * lgroups. 1293 */ 1294 klgrpset_clear(changed); 1295 lgrpid = my_lgrp->lgrp_id; 1296 if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_MEM], 1297 lgrpid)) 1298 klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid); 1299 if (need_synch) 1300 pause_cpus(NULL); 1301 count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1, 1302 &changed); 1303 if (need_synch) 1304 start_cpus(); 1305 } else if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_MEM], 1306 my_lgrp->lgrp_id)) { 1307 /* 1308 * Add new lgroup memory resource to existing lgroup 1309 */ 1310 lgrpid = my_lgrp->lgrp_id; 1311 klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid); 1312 klgrpset_add(changed, lgrpid); 1313 count++; 1314 for (i = 0; i <= lgrp_alloc_max; i++) { 1315 lgrp_t *lgrp; 1316 1317 lgrp = lgrp_table[i]; 1318 if (!LGRP_EXISTS(lgrp) || 1319 !lgrp_rsets_member(lgrp->lgrp_set, lgrpid)) 1320 continue; 1321 1322 klgrpset_add(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid); 1323 klgrpset_add(changed, lgrp->lgrp_id); 1324 count++; 1325 } 1326 } 1327 1328 /* 1329 * Add memory node to lgroup and remove lgroup from ones that need 1330 * to be updated 1331 */ 1332 if (!(my_lgrp->lgrp_mnodes & mnodes_mask)) { 1333 my_lgrp->lgrp_mnodes |= mnodes_mask; 1334 my_lgrp->lgrp_nmnodes++; 1335 } 1336 klgrpset_del(changed, lgrpid); 1337 1338 /* 1339 * Update memory node information for all lgroups that changed and 1340 * contain new memory node as a resource 1341 */ 1342 if (count) 1343 (void) lgrp_mnode_update(changed, NULL); 1344 1345 if (drop_lock) 1346 mutex_exit(&cpu_lock); 1347 } 1348 1349 /* 1350 * Called to indicate that the lgroup associated with the platform 1351 * handle "hand" no longer contains given memory node 1352 * 1353 * LOCKING for this routine is a bit tricky. Usually it is called without 1354 * cpu_lock and it must must grab cpu_lock here to prevent racing with other 1355 * callers. During DR of the board containing the caged memory it may be called 1356 * with cpu_lock already held and CPUs paused. 1357 * 1358 * If the deletion is part of the DR copy-rename and the deleted mnode is the 1359 * only one present in the lgrp_root->lgrp_mnodes, all the topology is updated, 1360 * but lgrp_root->lgrp_mnodes is left intact. Later, lgrp_mem_init() will insert 1361 * the same mnode back into the topology. See lgrp_mem_rename() and 1362 * lgrp_mem_init() for additional details. 1363 */ 1364 void 1365 lgrp_mem_fini(int mnode, lgrp_handle_t hand, boolean_t is_copy_rename) 1366 { 1367 klgrpset_t changed; 1368 int count; 1369 int i; 1370 lgrp_t *my_lgrp; 1371 lgrp_id_t lgrpid; 1372 mnodeset_t mnodes_mask; 1373 boolean_t drop_lock = B_FALSE; 1374 boolean_t need_synch = B_FALSE; 1375 1376 /* 1377 * Grab CPU lock (if we haven't already) 1378 */ 1379 if (!MUTEX_HELD(&cpu_lock)) { 1380 mutex_enter(&cpu_lock); 1381 drop_lock = B_TRUE; 1382 } 1383 1384 /* 1385 * This routine may be called from a context where we already 1386 * hold cpu_lock and have already paused cpus. 1387 */ 1388 if (!cpus_paused()) 1389 need_synch = B_TRUE; 1390 1391 my_lgrp = lgrp_hand_to_lgrp(hand); 1392 1393 /* 1394 * The lgrp *must* be pre-existing 1395 */ 1396 ASSERT(my_lgrp != NULL); 1397 1398 /* 1399 * Delete memory node from lgroups which contain it 1400 */ 1401 mnodes_mask = ((mnodeset_t)1 << mnode); 1402 for (i = 0; i <= lgrp_alloc_max; i++) { 1403 lgrp_t *lgrp = lgrp_table[i]; 1404 /* 1405 * Skip any non-existent lgroups and any lgroups that don't 1406 * contain leaf lgroup of memory as a memory resource 1407 */ 1408 if (!LGRP_EXISTS(lgrp) || 1409 !(lgrp->lgrp_mnodes & mnodes_mask)) 1410 continue; 1411 1412 /* 1413 * Avoid removing the last mnode from the root in the DR 1414 * copy-rename case. See lgrp_mem_rename() for details. 1415 */ 1416 if (is_copy_rename && 1417 (lgrp == lgrp_root) && (lgrp->lgrp_mnodes == mnodes_mask)) 1418 continue; 1419 1420 /* 1421 * Remove memory node from lgroup. 1422 */ 1423 lgrp->lgrp_mnodes &= ~mnodes_mask; 1424 lgrp->lgrp_nmnodes--; 1425 ASSERT(lgrp->lgrp_nmnodes >= 0); 1426 } 1427 ASSERT(lgrp_root->lgrp_nmnodes > 0); 1428 1429 /* 1430 * Don't need to update lgroup topology if this lgroup still has memory. 1431 * 1432 * In the special case of DR copy-rename with the only mnode being 1433 * removed, the lgrp_mnodes for the root is always non-zero, but we 1434 * still need to update the lgroup topology. 1435 */ 1436 if ((my_lgrp->lgrp_nmnodes > 0) && 1437 !(is_copy_rename && (my_lgrp == lgrp_root) && 1438 (my_lgrp->lgrp_mnodes == mnodes_mask))) { 1439 if (drop_lock) 1440 mutex_exit(&cpu_lock); 1441 return; 1442 } 1443 1444 /* 1445 * This lgroup does not contain any memory now 1446 */ 1447 klgrpset_clear(my_lgrp->lgrp_set[LGRP_RSRC_MEM]); 1448 1449 /* 1450 * Remove this lgroup from lgroup topology if it does not contain any 1451 * resources now 1452 */ 1453 lgrpid = my_lgrp->lgrp_id; 1454 count = 0; 1455 klgrpset_clear(changed); 1456 if (lgrp_rsets_empty(my_lgrp->lgrp_set)) { 1457 /* 1458 * Delete lgroup when no more resources 1459 */ 1460 if (need_synch) 1461 pause_cpus(NULL); 1462 count = lgrp_leaf_delete(my_lgrp, lgrp_table, 1463 lgrp_alloc_max + 1, &changed); 1464 ASSERT(count > 0); 1465 if (need_synch) 1466 start_cpus(); 1467 } else { 1468 /* 1469 * Remove lgroup from memory resources of any lgroups that 1470 * contain it as such 1471 */ 1472 for (i = 0; i <= lgrp_alloc_max; i++) { 1473 lgrp_t *lgrp; 1474 1475 lgrp = lgrp_table[i]; 1476 if (!LGRP_EXISTS(lgrp) || 1477 !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM], 1478 lgrpid)) 1479 continue; 1480 1481 klgrpset_del(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid); 1482 } 1483 } 1484 if (drop_lock) 1485 mutex_exit(&cpu_lock); 1486 } 1487 1488 /* 1489 * Return lgroup with given platform handle 1490 */ 1491 lgrp_t * 1492 lgrp_hand_to_lgrp(lgrp_handle_t hand) 1493 { 1494 int i; 1495 lgrp_t *lgrp; 1496 1497 if (hand == LGRP_NULL_HANDLE) 1498 return (NULL); 1499 1500 for (i = 0; i <= lgrp_alloc_max; i++) { 1501 lgrp = lgrp_table[i]; 1502 if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand) 1503 return (lgrp); 1504 } 1505 return (NULL); 1506 } 1507 1508 /* 1509 * Return the home lgroup of the current thread. 1510 * We must do this with kernel preemption disabled, since we don't want our 1511 * thread to be re-homed while we're poking around with its lpl, and the lpl 1512 * should never be NULL. 1513 * 1514 * NOTE: Can't guarantee that lgroup will be valid once kernel preemption 1515 * is enabled because of DR. Callers can use disable kernel preemption 1516 * around this call to guarantee that the lgroup will be valid beyond this 1517 * routine, since kernel preemption can be recursive. 1518 */ 1519 lgrp_t * 1520 lgrp_home_lgrp(void) 1521 { 1522 lgrp_t *lgrp; 1523 lpl_t *lpl; 1524 1525 kpreempt_disable(); 1526 1527 lpl = curthread->t_lpl; 1528 ASSERT(lpl != NULL); 1529 ASSERT(lpl->lpl_lgrpid >= 0 && lpl->lpl_lgrpid <= lgrp_alloc_max); 1530 ASSERT(LGRP_EXISTS(lgrp_table[lpl->lpl_lgrpid])); 1531 lgrp = lgrp_table[lpl->lpl_lgrpid]; 1532 1533 kpreempt_enable(); 1534 1535 return (lgrp); 1536 } 1537 1538 /* 1539 * Return ID of home lgroup for given thread 1540 * (See comments for lgrp_home_lgrp() for special care and handling 1541 * instructions) 1542 */ 1543 lgrp_id_t 1544 lgrp_home_id(kthread_t *t) 1545 { 1546 lgrp_id_t lgrp; 1547 lpl_t *lpl; 1548 1549 ASSERT(t != NULL); 1550 /* 1551 * We'd like to ASSERT(MUTEX_HELD(&ttoproc(t)->p_lock)), but we 1552 * cannot since the HAT layer can call into this routine to 1553 * determine the locality for its data structures in the context 1554 * of a page fault. 1555 */ 1556 1557 kpreempt_disable(); 1558 1559 lpl = t->t_lpl; 1560 ASSERT(lpl != NULL); 1561 ASSERT(lpl->lpl_lgrpid >= 0 && lpl->lpl_lgrpid <= lgrp_alloc_max); 1562 lgrp = lpl->lpl_lgrpid; 1563 1564 kpreempt_enable(); 1565 1566 return (lgrp); 1567 } 1568 1569 /* 1570 * Return lgroup containing the physical memory for the given page frame number 1571 */ 1572 lgrp_t * 1573 lgrp_pfn_to_lgrp(pfn_t pfn) 1574 { 1575 lgrp_handle_t hand; 1576 int i; 1577 lgrp_t *lgrp; 1578 1579 hand = lgrp_plat_pfn_to_hand(pfn); 1580 if (hand != LGRP_NULL_HANDLE) 1581 for (i = 0; i <= lgrp_alloc_max; i++) { 1582 lgrp = lgrp_table[i]; 1583 if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand) 1584 return (lgrp); 1585 } 1586 return (NULL); 1587 } 1588 1589 /* 1590 * Return lgroup containing the physical memory for the given page frame number 1591 */ 1592 lgrp_t * 1593 lgrp_phys_to_lgrp(u_longlong_t physaddr) 1594 { 1595 lgrp_handle_t hand; 1596 int i; 1597 lgrp_t *lgrp; 1598 pfn_t pfn; 1599 1600 pfn = btop(physaddr); 1601 hand = lgrp_plat_pfn_to_hand(pfn); 1602 if (hand != LGRP_NULL_HANDLE) 1603 for (i = 0; i <= lgrp_alloc_max; i++) { 1604 lgrp = lgrp_table[i]; 1605 if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand) 1606 return (lgrp); 1607 } 1608 return (NULL); 1609 } 1610 1611 /* 1612 * Return the leaf lgroup containing the given CPU 1613 * 1614 * The caller needs to take precautions necessary to prevent 1615 * "cpu", and it's lpl from going away across a call to this function. 1616 * hint: kpreempt_disable()/kpreempt_enable() 1617 */ 1618 static lgrp_t * 1619 lgrp_cpu_to_lgrp(cpu_t *cpu) 1620 { 1621 return (cpu->cpu_lpl->lpl_lgrp); 1622 } 1623 1624 /* 1625 * Return the sum of the partition loads in an lgrp divided by 1626 * the number of CPUs in the lgrp. This is our best approximation 1627 * of an 'lgroup load average' for a useful per-lgroup kstat. 1628 */ 1629 static uint64_t 1630 lgrp_sum_loadavgs(lgrp_t *lgrp) 1631 { 1632 cpu_t *cpu; 1633 int ncpu; 1634 uint64_t loads = 0; 1635 1636 mutex_enter(&cpu_lock); 1637 1638 cpu = lgrp->lgrp_cpu; 1639 ncpu = lgrp->lgrp_cpucnt; 1640 1641 if (cpu == NULL || ncpu == 0) { 1642 mutex_exit(&cpu_lock); 1643 return (0ull); 1644 } 1645 1646 do { 1647 loads += cpu->cpu_lpl->lpl_loadavg; 1648 cpu = cpu->cpu_next_lgrp; 1649 } while (cpu != lgrp->lgrp_cpu); 1650 1651 mutex_exit(&cpu_lock); 1652 1653 return (loads / ncpu); 1654 } 1655 1656 void 1657 lgrp_stat_add(lgrp_id_t lgrpid, lgrp_stat_t stat, int64_t val) 1658 { 1659 struct lgrp_stats *pstats; 1660 1661 /* 1662 * Verify that the caller isn't trying to add to 1663 * a statistic for an lgroup that has gone away 1664 */ 1665 if (lgrpid < 0 || lgrpid > lgrp_alloc_max) 1666 return; 1667 1668 pstats = &lgrp_stats[lgrpid]; 1669 atomic_add_64((uint64_t *)LGRP_STAT_WRITE_PTR(pstats, stat), val); 1670 } 1671 1672 int64_t 1673 lgrp_stat_read(lgrp_id_t lgrpid, lgrp_stat_t stat) 1674 { 1675 uint64_t val; 1676 struct lgrp_stats *pstats; 1677 1678 if (lgrpid < 0 || lgrpid > lgrp_alloc_max) 1679 return ((int64_t)0); 1680 1681 pstats = &lgrp_stats[lgrpid]; 1682 LGRP_STAT_READ(pstats, stat, val); 1683 return (val); 1684 } 1685 1686 /* 1687 * Reset all kstats for lgrp specified by its lgrpid. 1688 */ 1689 static void 1690 lgrp_kstat_reset(lgrp_id_t lgrpid) 1691 { 1692 lgrp_stat_t stat; 1693 1694 if (lgrpid < 0 || lgrpid > lgrp_alloc_max) 1695 return; 1696 1697 for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) { 1698 LGRP_STAT_RESET(&lgrp_stats[lgrpid], stat); 1699 } 1700 } 1701 1702 /* 1703 * Collect all per-lgrp statistics for the lgrp associated with this 1704 * kstat, and store them in the ks_data array. 1705 * 1706 * The superuser can reset all the running counter statistics for an 1707 * lgrp by writing to any of the lgrp's stats. 1708 */ 1709 static int 1710 lgrp_kstat_extract(kstat_t *ksp, int rw) 1711 { 1712 lgrp_stat_t stat; 1713 struct kstat_named *ksd; 1714 lgrp_t *lgrp; 1715 lgrp_id_t lgrpid; 1716 1717 lgrp = (lgrp_t *)ksp->ks_private; 1718 1719 ksd = (struct kstat_named *)ksp->ks_data; 1720 ASSERT(ksd == (struct kstat_named *)&lgrp_kstat_data); 1721 1722 lgrpid = lgrp->lgrp_id; 1723 1724 if (lgrpid == LGRP_NONE) { 1725 /* 1726 * Return all zeroes as stats for freed lgrp. 1727 */ 1728 for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) { 1729 ksd[stat].value.i64 = 0; 1730 } 1731 ksd[stat + LGRP_NUM_CPUS].value.i64 = 0; 1732 ksd[stat + LGRP_NUM_PG_INSTALL].value.i64 = 0; 1733 ksd[stat + LGRP_NUM_PG_AVAIL].value.i64 = 0; 1734 ksd[stat + LGRP_NUM_PG_FREE].value.i64 = 0; 1735 ksd[stat + LGRP_LOADAVG].value.i64 = 0; 1736 } else if (rw != KSTAT_WRITE) { 1737 /* 1738 * Handle counter stats 1739 */ 1740 for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) { 1741 ksd[stat].value.i64 = lgrp_stat_read(lgrpid, stat); 1742 } 1743 1744 /* 1745 * Handle kernel data snapshot stats 1746 */ 1747 ksd[stat + LGRP_NUM_CPUS].value.i64 = lgrp->lgrp_cpucnt; 1748 ksd[stat + LGRP_NUM_PG_INSTALL].value.i64 = 1749 lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_INSTALL); 1750 ksd[stat + LGRP_NUM_PG_AVAIL].value.i64 = 1751 lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_AVAIL); 1752 ksd[stat + LGRP_NUM_PG_FREE].value.i64 = 1753 lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_FREE); 1754 ksd[stat + LGRP_LOADAVG].value.i64 = lgrp_sum_loadavgs(lgrp); 1755 ksd[stat + LGRP_LOADAVG_SCALE].value.i64 = 1756 lgrp_loadavg_max_effect; 1757 } else { 1758 lgrp_kstat_reset(lgrpid); 1759 } 1760 1761 return (0); 1762 } 1763 1764 int 1765 lgrp_query_cpu(processorid_t id, lgrp_id_t *lp) 1766 { 1767 cpu_t *cp; 1768 1769 mutex_enter(&cpu_lock); 1770 1771 if ((cp = cpu_get(id)) == NULL) { 1772 mutex_exit(&cpu_lock); 1773 return (EINVAL); 1774 } 1775 1776 if (cpu_is_offline(cp) || cpu_is_poweredoff(cp)) { 1777 mutex_exit(&cpu_lock); 1778 return (EINVAL); 1779 } 1780 1781 ASSERT(cp->cpu_lpl != NULL); 1782 1783 *lp = cp->cpu_lpl->lpl_lgrpid; 1784 1785 mutex_exit(&cpu_lock); 1786 1787 return (0); 1788 } 1789 1790 int 1791 lgrp_query_load(processorid_t id, lgrp_load_t *lp) 1792 { 1793 cpu_t *cp; 1794 1795 mutex_enter(&cpu_lock); 1796 1797 if ((cp = cpu_get(id)) == NULL) { 1798 mutex_exit(&cpu_lock); 1799 return (EINVAL); 1800 } 1801 1802 ASSERT(cp->cpu_lpl != NULL); 1803 1804 *lp = cp->cpu_lpl->lpl_loadavg; 1805 1806 mutex_exit(&cpu_lock); 1807 1808 return (0); 1809 } 1810 1811 /* 1812 * Add a resource named by lpl_leaf to rset of lpl_target 1813 * 1814 * This routine also adjusts ncpu and nrset if the call succeeds in adding a 1815 * resource. It is adjusted here, as this is presently the only place that we 1816 * can be certain a resource addition has succeeded. 1817 * 1818 * We keep the list of rsets sorted so that the dispatcher can quickly walk the 1819 * list in order until it reaches a NULL. (This list is required to be NULL 1820 * terminated, too). This is done so that we can mark start pos + 1, so that 1821 * each lpl is traversed sequentially, but in a different order. We hope this 1822 * will improve performance a bit. (Hopefully, less read-to-own traffic...) 1823 */ 1824 1825 void 1826 lpl_rset_add(lpl_t *lpl_target, lpl_t *lpl_leaf) 1827 { 1828 int i; 1829 int entry_slot = 0; 1830 1831 /* return if leaf is already present */ 1832 for (i = 0; i < lpl_target->lpl_nrset; i++) { 1833 if (lpl_target->lpl_rset[i] == lpl_leaf) { 1834 return; 1835 } 1836 1837 if (lpl_target->lpl_rset[i]->lpl_lgrpid > 1838 lpl_leaf->lpl_lgrpid) { 1839 break; 1840 } 1841 } 1842 1843 /* insert leaf, update counts */ 1844 entry_slot = i; 1845 i = lpl_target->lpl_nrset++; 1846 1847 /* 1848 * Start at the end of the rset array and work backwards towards the 1849 * slot into which the new lpl will be inserted. This effectively 1850 * preserves the current ordering by scooting everybody over one entry, 1851 * and placing the new entry into the space created. 1852 */ 1853 while (i-- > entry_slot) { 1854 lpl_target->lpl_rset[i + 1] = lpl_target->lpl_rset[i]; 1855 lpl_target->lpl_id2rset[lpl_target->lpl_rset[i]->lpl_lgrpid] = 1856 i + 1; 1857 } 1858 1859 lpl_target->lpl_rset[entry_slot] = lpl_leaf; 1860 lpl_target->lpl_id2rset[lpl_leaf->lpl_lgrpid] = entry_slot; 1861 1862 lpl_target->lpl_ncpu += lpl_leaf->lpl_ncpu; 1863 } 1864 1865 /* 1866 * Update each of lpl_parent's children with a reference to their parent. 1867 * The lgrp topology is used as the reference since it is fully 1868 * consistent and correct at this point. 1869 * This should be called after any potential change in lpl_parent's 1870 * rset. 1871 */ 1872 static void 1873 lpl_child_update(lpl_t *lpl_parent, struct cpupart *cp) 1874 { 1875 klgrpset_t children; 1876 int i; 1877 1878 children = lgrp_table[lpl_parent->lpl_lgrpid]->lgrp_children; 1879 if (klgrpset_isempty(children)) 1880 return; /* nothing to do */ 1881 1882 for (i = 0; i <= lgrp_alloc_max; i++) { 1883 if (klgrpset_ismember(children, i)) { 1884 /* 1885 * (Re)set the parent. It may be incorrect if 1886 * lpl_parent is new in the topology. 1887 */ 1888 cp->cp_lgrploads[i].lpl_parent = lpl_parent; 1889 } 1890 } 1891 } 1892 1893 /* 1894 * Delete resource lpl_leaf from rset of lpl_target, assuming it's there. 1895 * 1896 * This routine also adjusts ncpu and nrset if the call succeeds in deleting a 1897 * resource. The values are adjusted here, as this is the only place that we can 1898 * be certain a resource was successfully deleted. 1899 */ 1900 void 1901 lpl_rset_del(lpl_t *lpl_target, lpl_t *lpl_leaf) 1902 { 1903 int i; 1904 lpl_t *leaf; 1905 1906 if (lpl_target->lpl_nrset == 0) 1907 return; 1908 1909 /* find leaf in intermediate node */ 1910 for (i = 0; i < lpl_target->lpl_nrset; i++) { 1911 if (lpl_target->lpl_rset[i] == lpl_leaf) 1912 break; 1913 } 1914 1915 /* return if leaf not found */ 1916 if (lpl_target->lpl_rset[i] != lpl_leaf) 1917 return; 1918 1919 /* prune leaf, compress array */ 1920 lpl_target->lpl_rset[lpl_target->lpl_nrset--] = NULL; 1921 lpl_target->lpl_id2rset[lpl_leaf->lpl_lgrpid] = -1; 1922 lpl_target->lpl_ncpu--; 1923 do { 1924 lpl_target->lpl_rset[i] = lpl_target->lpl_rset[i + 1]; 1925 /* 1926 * Update the lgrp id <=> rset mapping 1927 */ 1928 if ((leaf = lpl_target->lpl_rset[i]) != NULL) { 1929 lpl_target->lpl_id2rset[leaf->lpl_lgrpid] = i; 1930 } 1931 } while (i++ < lpl_target->lpl_nrset); 1932 } 1933 1934 /* 1935 * Check to see if the resource set of the target lpl contains the 1936 * supplied leaf lpl. This returns 1 if the lpl is found, 0 if it is not. 1937 */ 1938 1939 int 1940 lpl_rset_contains(lpl_t *lpl_target, lpl_t *lpl_leaf) 1941 { 1942 int i; 1943 1944 for (i = 0; i < lpl_target->lpl_nrset; i++) { 1945 if (lpl_target->lpl_rset[i] == lpl_leaf) 1946 return (1); 1947 } 1948 1949 return (0); 1950 } 1951 1952 /* 1953 * Called when we change cpu lpl membership. This increments or decrements the 1954 * per-cpu counter in every lpl in which our leaf appears. 1955 */ 1956 void 1957 lpl_cpu_adjcnt(lpl_act_t act, cpu_t *cp) 1958 { 1959 cpupart_t *cpupart; 1960 lgrp_t *lgrp_leaf; 1961 lgrp_t *lgrp_cur; 1962 lpl_t *lpl_leaf; 1963 lpl_t *lpl_cur; 1964 int i; 1965 1966 ASSERT(act == LPL_DECREMENT || act == LPL_INCREMENT); 1967 1968 cpupart = cp->cpu_part; 1969 lpl_leaf = cp->cpu_lpl; 1970 lgrp_leaf = lgrp_table[lpl_leaf->lpl_lgrpid]; 1971 1972 for (i = 0; i <= lgrp_alloc_max; i++) { 1973 lgrp_cur = lgrp_table[i]; 1974 1975 /* 1976 * Don't adjust if the lgrp isn't there, if we're the leaf lpl 1977 * for the cpu in question, or if the current lgrp and leaf 1978 * don't share the same resources. 1979 */ 1980 1981 if (!LGRP_EXISTS(lgrp_cur) || (lgrp_cur == lgrp_leaf) || 1982 !klgrpset_intersects(lgrp_leaf->lgrp_set[LGRP_RSRC_CPU], 1983 lgrp_cur->lgrp_set[LGRP_RSRC_CPU])) 1984 continue; 1985 1986 1987 lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id]; 1988 1989 if (lpl_cur->lpl_nrset > 0) { 1990 if (act == LPL_INCREMENT) { 1991 lpl_cur->lpl_ncpu++; 1992 } else if (act == LPL_DECREMENT) { 1993 lpl_cur->lpl_ncpu--; 1994 } 1995 } 1996 } 1997 } 1998 1999 /* 2000 * Initialize lpl with given resources and specified lgrp 2001 */ 2002 void 2003 lpl_init(lpl_t *lpl, lpl_t *lpl_leaf, lgrp_t *lgrp) 2004 { 2005 lpl->lpl_lgrpid = lgrp->lgrp_id; 2006 lpl->lpl_loadavg = 0; 2007 if (lpl == lpl_leaf) 2008 lpl->lpl_ncpu = 1; 2009 else 2010 lpl->lpl_ncpu = lpl_leaf->lpl_ncpu; 2011 lpl->lpl_nrset = 1; 2012 lpl->lpl_rset[0] = lpl_leaf; 2013 lpl->lpl_id2rset[lpl_leaf->lpl_lgrpid] = 0; 2014 lpl->lpl_lgrp = lgrp; 2015 lpl->lpl_parent = NULL; /* set by lpl_leaf_insert() */ 2016 lpl->lpl_cpus = NULL; /* set by lgrp_part_add_cpu() */ 2017 } 2018 2019 /* 2020 * Clear an unused lpl 2021 */ 2022 void 2023 lpl_clear(lpl_t *lpl) 2024 { 2025 /* 2026 * Clear out all fields in the lpl except: 2027 * lpl_lgrpid - to facilitate debugging 2028 * lpl_rset, lpl_rset_sz, lpl_id2rset - rset array references / size 2029 * 2030 * Note that the lpl's rset and id2rset mapping are cleared as well. 2031 */ 2032 lpl->lpl_loadavg = 0; 2033 lpl->lpl_ncpu = 0; 2034 lpl->lpl_lgrp = NULL; 2035 lpl->lpl_parent = NULL; 2036 lpl->lpl_cpus = NULL; 2037 lpl->lpl_nrset = 0; 2038 lpl->lpl_homed_time = 0; 2039 bzero(lpl->lpl_rset, sizeof (lpl->lpl_rset[0]) * lpl->lpl_rset_sz); 2040 bzero(lpl->lpl_id2rset, 2041 sizeof (lpl->lpl_id2rset[0]) * lpl->lpl_rset_sz); 2042 } 2043 2044 /* 2045 * Given a CPU-partition, verify that the lpl topology in the CPU-partition 2046 * is in sync with the lgroup toplogy in the system. The lpl topology may not 2047 * make full use of all of the lgroup topology, but this checks to make sure 2048 * that for the parts that it does use, it has correctly understood the 2049 * relationships that exist. This function returns 2050 * 0 if the topology is correct, and a non-zero error code, for non-debug 2051 * kernels if incorrect. Asserts are spread throughout the code to aid in 2052 * debugging on a DEBUG kernel. 2053 */ 2054 int 2055 lpl_topo_verify(cpupart_t *cpupart) 2056 { 2057 lgrp_t *lgrp; 2058 lpl_t *lpl; 2059 klgrpset_t rset; 2060 klgrpset_t cset; 2061 cpu_t *cpu; 2062 cpu_t *cp_start; 2063 int i; 2064 int j; 2065 int sum; 2066 2067 /* topology can't be incorrect if it doesn't exist */ 2068 if (!lgrp_topo_initialized || !lgrp_initialized) 2069 return (LPL_TOPO_CORRECT); 2070 2071 ASSERT(cpupart != NULL); 2072 2073 for (i = 0; i <= lgrp_alloc_max; i++) { 2074 lgrp = lgrp_table[i]; 2075 lpl = NULL; 2076 /* make sure lpls are allocated */ 2077 ASSERT(cpupart->cp_lgrploads); 2078 if (!cpupart->cp_lgrploads) 2079 return (LPL_TOPO_PART_HAS_NO_LPL); 2080 2081 lpl = &cpupart->cp_lgrploads[i]; 2082 /* make sure our index is good */ 2083 ASSERT(i < cpupart->cp_nlgrploads); 2084 2085 /* if lgroup doesn't exist, make sure lpl is empty */ 2086 if (!LGRP_EXISTS(lgrp)) { 2087 ASSERT(lpl->lpl_ncpu == 0); 2088 if (lpl->lpl_ncpu > 0) { 2089 return (LPL_TOPO_CPUS_NOT_EMPTY); 2090 } else { 2091 continue; 2092 } 2093 } 2094 2095 /* verify that lgroup and lpl are identically numbered */ 2096 ASSERT(lgrp->lgrp_id == lpl->lpl_lgrpid); 2097 2098 /* if lgroup isn't in our partition, make sure lpl is empty */ 2099 if (!klgrpset_intersects(lgrp->lgrp_leaves, 2100 cpupart->cp_lgrpset)) { 2101 ASSERT(lpl->lpl_ncpu == 0); 2102 if (lpl->lpl_ncpu > 0) { 2103 return (LPL_TOPO_CPUS_NOT_EMPTY); 2104 } 2105 /* 2106 * lpl is empty, and lgroup isn't in partition. verify 2107 * that lpl doesn't show up in anyone else's rsets (in 2108 * this partition, anyway) 2109 */ 2110 for (j = 0; j < cpupart->cp_nlgrploads; j++) { 2111 lpl_t *i_lpl; /* lpl we're iterating over */ 2112 2113 i_lpl = &cpupart->cp_lgrploads[j]; 2114 2115 ASSERT(!lpl_rset_contains(i_lpl, lpl)); 2116 if (lpl_rset_contains(i_lpl, lpl)) { 2117 return (LPL_TOPO_LPL_ORPHANED); 2118 } 2119 } 2120 /* lgroup is empty, and everything is ok. continue */ 2121 continue; 2122 } 2123 2124 2125 /* lgroup is in this partition, now check it against lpl */ 2126 2127 /* do both have matching lgrps? */ 2128 ASSERT(lgrp == lpl->lpl_lgrp); 2129 if (lgrp != lpl->lpl_lgrp) { 2130 return (LPL_TOPO_LGRP_MISMATCH); 2131 } 2132 2133 /* do the parent lgroups exist and do they match? */ 2134 if (lgrp->lgrp_parent) { 2135 ASSERT(lpl->lpl_parent); 2136 ASSERT(lgrp->lgrp_parent->lgrp_id == 2137 lpl->lpl_parent->lpl_lgrpid); 2138 2139 if (!lpl->lpl_parent) { 2140 return (LPL_TOPO_MISSING_PARENT); 2141 } else if (lgrp->lgrp_parent->lgrp_id != 2142 lpl->lpl_parent->lpl_lgrpid) { 2143 return (LPL_TOPO_PARENT_MISMATCH); 2144 } 2145 } 2146 2147 /* only leaf lgroups keep a cpucnt, only check leaves */ 2148 if ((lpl->lpl_nrset == 1) && (lpl == lpl->lpl_rset[0])) { 2149 2150 /* verify that lgrp is also a leaf */ 2151 ASSERT((lgrp->lgrp_childcnt == 0) && 2152 (klgrpset_ismember(lgrp->lgrp_leaves, 2153 lpl->lpl_lgrpid))); 2154 2155 if ((lgrp->lgrp_childcnt > 0) || 2156 (!klgrpset_ismember(lgrp->lgrp_leaves, 2157 lpl->lpl_lgrpid))) { 2158 return (LPL_TOPO_LGRP_NOT_LEAF); 2159 } 2160 2161 ASSERT((lgrp->lgrp_cpucnt >= lpl->lpl_ncpu) && 2162 (lpl->lpl_ncpu > 0)); 2163 if ((lgrp->lgrp_cpucnt < lpl->lpl_ncpu) || 2164 (lpl->lpl_ncpu <= 0)) { 2165 return (LPL_TOPO_BAD_CPUCNT); 2166 } 2167 2168 /* 2169 * Check that lpl_ncpu also matches the number of 2170 * cpus in the lpl's linked list. This only exists in 2171 * leaves, but they should always match. 2172 */ 2173 j = 0; 2174 cpu = cp_start = lpl->lpl_cpus; 2175 while (cpu != NULL) { 2176 j++; 2177 2178 /* check to make sure cpu's lpl is leaf lpl */ 2179 ASSERT(cpu->cpu_lpl == lpl); 2180 if (cpu->cpu_lpl != lpl) { 2181 return (LPL_TOPO_CPU_HAS_BAD_LPL); 2182 } 2183 2184 /* check next cpu */ 2185 if ((cpu = cpu->cpu_next_lpl) != cp_start) { 2186 continue; 2187 } else { 2188 cpu = NULL; 2189 } 2190 } 2191 2192 ASSERT(j == lpl->lpl_ncpu); 2193 if (j != lpl->lpl_ncpu) { 2194 return (LPL_TOPO_LPL_BAD_NCPU); 2195 } 2196 2197 /* 2198 * Also, check that leaf lpl is contained in all 2199 * intermediate lpls that name the leaf as a descendant 2200 */ 2201 for (j = 0; j <= lgrp_alloc_max; j++) { 2202 klgrpset_t intersect; 2203 lgrp_t *lgrp_cand; 2204 lpl_t *lpl_cand; 2205 2206 lgrp_cand = lgrp_table[j]; 2207 intersect = klgrpset_intersects( 2208 lgrp_cand->lgrp_set[LGRP_RSRC_CPU], 2209 cpupart->cp_lgrpset); 2210 2211 if (!LGRP_EXISTS(lgrp_cand) || 2212 !klgrpset_intersects(lgrp_cand->lgrp_leaves, 2213 cpupart->cp_lgrpset) || 2214 (intersect == 0)) 2215 continue; 2216 2217 lpl_cand = 2218 &cpupart->cp_lgrploads[lgrp_cand->lgrp_id]; 2219 2220 if (klgrpset_ismember(intersect, 2221 lgrp->lgrp_id)) { 2222 ASSERT(lpl_rset_contains(lpl_cand, 2223 lpl)); 2224 2225 if (!lpl_rset_contains(lpl_cand, lpl)) { 2226 return (LPL_TOPO_RSET_MSSNG_LF); 2227 } 2228 } 2229 } 2230 2231 } else { /* non-leaf specific checks */ 2232 2233 /* 2234 * Non-leaf lpls should have lpl_cpus == NULL 2235 * verify that this is so 2236 */ 2237 ASSERT(lpl->lpl_cpus == NULL); 2238 if (lpl->lpl_cpus != NULL) { 2239 return (LPL_TOPO_NONLEAF_HAS_CPUS); 2240 } 2241 2242 /* 2243 * verify that the sum of the cpus in the leaf resources 2244 * is equal to the total ncpu in the intermediate 2245 */ 2246 for (j = sum = 0; j < lpl->lpl_nrset; j++) { 2247 sum += lpl->lpl_rset[j]->lpl_ncpu; 2248 } 2249 2250 ASSERT(sum == lpl->lpl_ncpu); 2251 if (sum != lpl->lpl_ncpu) { 2252 return (LPL_TOPO_LPL_BAD_NCPU); 2253 } 2254 } 2255 2256 /* 2257 * Check the rset of the lpl in question. Make sure that each 2258 * rset contains a subset of the resources in 2259 * lgrp_set[LGRP_RSRC_CPU] and in cp_lgrpset. This also makes 2260 * sure that each rset doesn't include resources that are 2261 * outside of that set. (Which would be resources somehow not 2262 * accounted for). 2263 */ 2264 klgrpset_clear(rset); 2265 for (j = 0; j < lpl->lpl_nrset; j++) { 2266 klgrpset_add(rset, lpl->lpl_rset[j]->lpl_lgrpid); 2267 } 2268 klgrpset_copy(cset, rset); 2269 /* make sure lpl rset matches lgrp rset */ 2270 klgrpset_diff(rset, lgrp->lgrp_set[LGRP_RSRC_CPU]); 2271 /* make sure rset is contained with in partition, too */ 2272 klgrpset_diff(cset, cpupart->cp_lgrpset); 2273 2274 ASSERT(klgrpset_isempty(rset) && klgrpset_isempty(cset)); 2275 if (!klgrpset_isempty(rset) || !klgrpset_isempty(cset)) { 2276 return (LPL_TOPO_RSET_MISMATCH); 2277 } 2278 2279 /* 2280 * check to make sure lpl_nrset matches the number of rsets 2281 * contained in the lpl 2282 */ 2283 for (j = 0; j < lpl->lpl_nrset; j++) { 2284 if (lpl->lpl_rset[j] == NULL) 2285 break; 2286 } 2287 2288 ASSERT(j == lpl->lpl_nrset); 2289 if (j != lpl->lpl_nrset) { 2290 return (LPL_TOPO_BAD_RSETCNT); 2291 } 2292 2293 } 2294 return (LPL_TOPO_CORRECT); 2295 } 2296 2297 /* 2298 * Flatten lpl topology to given number of levels. This is presently only 2299 * implemented for a flatten to 2 levels, which will prune out the intermediates 2300 * and home the leaf lpls to the root lpl. 2301 */ 2302 int 2303 lpl_topo_flatten(int levels) 2304 { 2305 int i; 2306 uint_t sum; 2307 lgrp_t *lgrp_cur; 2308 lpl_t *lpl_cur; 2309 lpl_t *lpl_root; 2310 cpupart_t *cp; 2311 2312 if (levels != 2) 2313 return (0); 2314 2315 /* called w/ cpus paused - grab no locks! */ 2316 ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 || 2317 !lgrp_initialized); 2318 2319 cp = cp_list_head; 2320 do { 2321 lpl_root = &cp->cp_lgrploads[lgrp_root->lgrp_id]; 2322 ASSERT(LGRP_EXISTS(lgrp_root) && (lpl_root->lpl_ncpu > 0)); 2323 2324 for (i = 0; i <= lgrp_alloc_max; i++) { 2325 lgrp_cur = lgrp_table[i]; 2326 lpl_cur = &cp->cp_lgrploads[i]; 2327 2328 if ((lgrp_cur == lgrp_root) || 2329 (!LGRP_EXISTS(lgrp_cur) && 2330 (lpl_cur->lpl_ncpu == 0))) 2331 continue; 2332 2333 if (!LGRP_EXISTS(lgrp_cur) && (lpl_cur->lpl_ncpu > 0)) { 2334 /* 2335 * this should be a deleted intermediate, so 2336 * clear it 2337 */ 2338 lpl_clear(lpl_cur); 2339 } else if ((lpl_cur->lpl_nrset == 1) && 2340 (lpl_cur->lpl_rset[0] == lpl_cur) && 2341 ((lpl_cur->lpl_parent->lpl_ncpu == 0) || 2342 (!LGRP_EXISTS(lpl_cur->lpl_parent->lpl_lgrp)))) { 2343 /* 2344 * this is a leaf whose parent was deleted, or 2345 * whose parent had their lgrp deleted. (And 2346 * whose parent will soon be deleted). Point 2347 * this guy back to the root lpl. 2348 */ 2349 lpl_cur->lpl_parent = lpl_root; 2350 lpl_rset_add(lpl_root, lpl_cur); 2351 } 2352 2353 } 2354 2355 /* 2356 * Now that we're done, make sure the count on the root lpl is 2357 * correct, and update the hints of the children for the sake of 2358 * thoroughness 2359 */ 2360 for (i = sum = 0; i < lpl_root->lpl_nrset; i++) { 2361 sum += lpl_root->lpl_rset[i]->lpl_ncpu; 2362 } 2363 lpl_root->lpl_ncpu = sum; 2364 lpl_child_update(lpl_root, cp); 2365 2366 cp = cp->cp_next; 2367 } while (cp != cp_list_head); 2368 2369 return (levels); 2370 } 2371 2372 /* 2373 * Insert a lpl into the resource hierarchy and create any additional lpls that 2374 * are necessary to represent the varying states of locality for the cpu 2375 * resoruces newly added to the partition. 2376 * 2377 * This routine is clever enough that it can correctly add resources from the 2378 * new leaf into both direct and indirect resource sets in the hierarchy. (Ie, 2379 * those for which the lpl is a leaf as opposed to simply a named equally local 2380 * resource). The one special case that needs additional processing is when a 2381 * new intermediate lpl is introduced. Since the main loop only traverses 2382 * looking to add the leaf resource where it does not yet exist, additional work 2383 * is necessary to add other leaf resources that may need to exist in the newly 2384 * created intermediate. This is performed by the second inner loop, and is 2385 * only done when the check for more than one overlapping resource succeeds. 2386 */ 2387 2388 void 2389 lpl_leaf_insert(lpl_t *lpl_leaf, cpupart_t *cpupart) 2390 { 2391 int i; 2392 int j; 2393 int rset_num_intersect; 2394 lgrp_t *lgrp_cur; 2395 lpl_t *lpl_cur; 2396 lpl_t *lpl_parent; 2397 lgrp_id_t parent_id; 2398 klgrpset_t rset_intersect; /* resources in cpupart and lgrp */ 2399 2400 for (i = 0; i <= lgrp_alloc_max; i++) { 2401 lgrp_cur = lgrp_table[i]; 2402 2403 /* 2404 * Don't insert if the lgrp isn't there, if the leaf isn't 2405 * contained within the current lgrp, or if the current lgrp has 2406 * no leaves in this partition 2407 */ 2408 2409 if (!LGRP_EXISTS(lgrp_cur) || 2410 !klgrpset_ismember(lgrp_cur->lgrp_set[LGRP_RSRC_CPU], 2411 lpl_leaf->lpl_lgrpid) || 2412 !klgrpset_intersects(lgrp_cur->lgrp_leaves, 2413 cpupart->cp_lgrpset)) 2414 continue; 2415 2416 lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id]; 2417 if (lgrp_cur->lgrp_parent != NULL) { 2418 /* if lgrp has a parent, assign it properly */ 2419 parent_id = lgrp_cur->lgrp_parent->lgrp_id; 2420 lpl_parent = &cpupart->cp_lgrploads[parent_id]; 2421 } else { 2422 /* if not, make sure parent ptr gets set to null */ 2423 lpl_parent = NULL; 2424 } 2425 2426 if (lpl_cur == lpl_leaf) { 2427 /* 2428 * Almost all leaf state was initialized elsewhere. The 2429 * only thing left to do is to set the parent. 2430 */ 2431 lpl_cur->lpl_parent = lpl_parent; 2432 continue; 2433 } 2434 2435 lpl_clear(lpl_cur); 2436 lpl_init(lpl_cur, lpl_leaf, lgrp_cur); 2437 2438 lpl_cur->lpl_parent = lpl_parent; 2439 2440 /* does new lpl need to be populated with other resources? */ 2441 rset_intersect = 2442 klgrpset_intersects(lgrp_cur->lgrp_set[LGRP_RSRC_CPU], 2443 cpupart->cp_lgrpset); 2444 klgrpset_nlgrps(rset_intersect, rset_num_intersect); 2445 2446 if (rset_num_intersect > 1) { 2447 /* 2448 * If so, figure out what lpls have resources that 2449 * intersect this one, and add them. 2450 */ 2451 for (j = 0; j <= lgrp_alloc_max; j++) { 2452 lgrp_t *lgrp_cand; /* candidate lgrp */ 2453 lpl_t *lpl_cand; /* candidate lpl */ 2454 2455 lgrp_cand = lgrp_table[j]; 2456 if (!LGRP_EXISTS(lgrp_cand) || 2457 !klgrpset_ismember(rset_intersect, 2458 lgrp_cand->lgrp_id)) 2459 continue; 2460 lpl_cand = 2461 &cpupart->cp_lgrploads[lgrp_cand->lgrp_id]; 2462 lpl_rset_add(lpl_cur, lpl_cand); 2463 } 2464 } 2465 /* 2466 * This lpl's rset has changed. Update the hint in it's 2467 * children. 2468 */ 2469 lpl_child_update(lpl_cur, cpupart); 2470 } 2471 } 2472 2473 /* 2474 * remove a lpl from the hierarchy of resources, clearing its state when 2475 * finished. If the lpls at the intermediate levels of the hierarchy have no 2476 * remaining resources, or no longer name a leaf resource in the cpu-partition, 2477 * delete them as well. 2478 */ 2479 2480 void 2481 lpl_leaf_remove(lpl_t *lpl_leaf, cpupart_t *cpupart) 2482 { 2483 int i; 2484 lgrp_t *lgrp_cur; 2485 lpl_t *lpl_cur; 2486 klgrpset_t leaf_intersect; /* intersection of leaves */ 2487 2488 for (i = 0; i <= lgrp_alloc_max; i++) { 2489 lgrp_cur = lgrp_table[i]; 2490 2491 /* 2492 * Don't attempt to remove from lgrps that aren't there, that 2493 * don't contain our leaf, or from the leaf itself. (We do that 2494 * later) 2495 */ 2496 2497 if (!LGRP_EXISTS(lgrp_cur)) 2498 continue; 2499 2500 lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id]; 2501 2502 if (!klgrpset_ismember(lgrp_cur->lgrp_set[LGRP_RSRC_CPU], 2503 lpl_leaf->lpl_lgrpid) || 2504 (lpl_cur == lpl_leaf)) { 2505 continue; 2506 } 2507 2508 /* 2509 * This is a slightly sleazy simplification in that we have 2510 * already marked the cp_lgrpset as no longer containing the 2511 * leaf we've deleted. Any lpls that pass the above checks 2512 * based upon lgrp membership but not necessarily cpu-part 2513 * membership also get cleared by the checks below. Currently 2514 * this is harmless, as the lpls should be empty anyway. 2515 * 2516 * In particular, we want to preserve lpls that have additional 2517 * leaf resources, even though we don't yet have a processor 2518 * architecture that represents resources this way. 2519 */ 2520 2521 leaf_intersect = klgrpset_intersects(lgrp_cur->lgrp_leaves, 2522 cpupart->cp_lgrpset); 2523 2524 lpl_rset_del(lpl_cur, lpl_leaf); 2525 if ((lpl_cur->lpl_nrset == 0) || (!leaf_intersect)) { 2526 lpl_clear(lpl_cur); 2527 } else { 2528 /* 2529 * Update this lpl's children 2530 */ 2531 lpl_child_update(lpl_cur, cpupart); 2532 } 2533 } 2534 lpl_clear(lpl_leaf); 2535 } 2536 2537 /* 2538 * add a cpu to a partition in terms of lgrp load avg bookeeping 2539 * 2540 * The lpl (cpu partition load average information) is now arranged in a 2541 * hierarchical fashion whereby resources that are closest, ie. most local, to 2542 * the cpu in question are considered to be leaves in a tree of resources. 2543 * There are two general cases for cpu additon: 2544 * 2545 * 1. A lpl structure that contains resources already in the hierarchy tree. 2546 * In this case, all of the associated lpl relationships have been defined, and 2547 * all that is necessary is that we link the new cpu into the per-lpl list of 2548 * cpus, and increment the ncpu count of all places where this cpu resource will 2549 * be accounted for. lpl_cpu_adjcnt updates the cpu count, and the cpu pointer 2550 * pushing is accomplished by this routine. 2551 * 2552 * 2. The lpl to contain the resources in this cpu-partition for this lgrp does 2553 * not exist yet. In this case, it is necessary to build the leaf lpl, and 2554 * construct the hierarchy of state necessary to name it's more distant 2555 * resources, if they should exist. The leaf structure is initialized by this 2556 * routine, as is the cpu-partition state for the lgrp membership. This routine 2557 * also calls lpl_leaf_insert() which inserts the named lpl into the hierarchy 2558 * and builds all of the "ancestoral" state necessary to identify resources at 2559 * differing levels of locality. 2560 */ 2561 void 2562 lgrp_part_add_cpu(cpu_t *cp, lgrp_id_t lgrpid) 2563 { 2564 cpupart_t *cpupart; 2565 lgrp_t *lgrp_leaf; 2566 lpl_t *lpl_leaf; 2567 2568 /* called sometimes w/ cpus paused - grab no locks */ 2569 ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized); 2570 2571 cpupart = cp->cpu_part; 2572 lgrp_leaf = lgrp_table[lgrpid]; 2573 2574 /* don't add non-existent lgrp */ 2575 ASSERT(LGRP_EXISTS(lgrp_leaf)); 2576 lpl_leaf = &cpupart->cp_lgrploads[lgrpid]; 2577 cp->cpu_lpl = lpl_leaf; 2578 2579 /* only leaf lpls contain cpus */ 2580 2581 if (lpl_leaf->lpl_ncpu++ == 0) { 2582 lpl_init(lpl_leaf, lpl_leaf, lgrp_leaf); 2583 klgrpset_add(cpupart->cp_lgrpset, lgrpid); 2584 lpl_leaf_insert(lpl_leaf, cpupart); 2585 } else { 2586 /* 2587 * the lpl should already exist in the parent, so just update 2588 * the count of available CPUs 2589 */ 2590 lpl_cpu_adjcnt(LPL_INCREMENT, cp); 2591 } 2592 2593 /* link cpu into list of cpus in lpl */ 2594 2595 if (lpl_leaf->lpl_cpus) { 2596 cp->cpu_next_lpl = lpl_leaf->lpl_cpus; 2597 cp->cpu_prev_lpl = lpl_leaf->lpl_cpus->cpu_prev_lpl; 2598 lpl_leaf->lpl_cpus->cpu_prev_lpl->cpu_next_lpl = cp; 2599 lpl_leaf->lpl_cpus->cpu_prev_lpl = cp; 2600 } else { 2601 /* 2602 * We increment ncpu immediately after we create a new leaf 2603 * lpl, so assert that ncpu == 1 for the case where we don't 2604 * have any cpu pointers yet. 2605 */ 2606 ASSERT(lpl_leaf->lpl_ncpu == 1); 2607 lpl_leaf->lpl_cpus = cp->cpu_next_lpl = cp->cpu_prev_lpl = cp; 2608 } 2609 2610 } 2611 2612 2613 /* 2614 * remove a cpu from a partition in terms of lgrp load avg bookeeping 2615 * 2616 * The lpl (cpu partition load average information) is now arranged in a 2617 * hierarchical fashion whereby resources that are closest, ie. most local, to 2618 * the cpu in question are considered to be leaves in a tree of resources. 2619 * There are two removal cases in question: 2620 * 2621 * 1. Removal of the resource in the leaf leaves other resources remaining in 2622 * that leaf. (Another cpu still exists at this level of locality). In this 2623 * case, the count of available cpus is decremented in all assocated lpls by 2624 * calling lpl_adj_cpucnt(), and the pointer to the removed cpu is pruned 2625 * from the per-cpu lpl list. 2626 * 2627 * 2. Removal of the resource results in the lpl containing no resources. (It's 2628 * empty) In this case, all of what has occurred for the first step must take 2629 * place; however, additionally we must remove the lpl structure itself, prune 2630 * out any stranded lpls that do not directly name a leaf resource, and mark the 2631 * cpu partition in question as no longer containing resources from the lgrp of 2632 * the lpl that has been delted. Cpu-partition changes are handled by this 2633 * method, but the lpl_leaf_remove function deals with the details of pruning 2634 * out the empty lpl and any of its orphaned direct ancestors. 2635 */ 2636 void 2637 lgrp_part_del_cpu(cpu_t *cp) 2638 { 2639 lpl_t *lpl; 2640 lpl_t *leaf_lpl; 2641 lgrp_t *lgrp_leaf; 2642 2643 /* called sometimes w/ cpus paused - grab no locks */ 2644 2645 ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized); 2646 2647 lpl = leaf_lpl = cp->cpu_lpl; 2648 lgrp_leaf = leaf_lpl->lpl_lgrp; 2649 2650 /* don't delete a leaf that isn't there */ 2651 ASSERT(LGRP_EXISTS(lgrp_leaf)); 2652 2653 /* no double-deletes */ 2654 ASSERT(lpl->lpl_ncpu); 2655 if (--lpl->lpl_ncpu == 0) { 2656 /* 2657 * This was the last cpu in this lgroup for this partition, 2658 * clear its bit in the partition's lgroup bitmask 2659 */ 2660 klgrpset_del(cp->cpu_part->cp_lgrpset, lpl->lpl_lgrpid); 2661 2662 /* eliminate remaning lpl link pointers in cpu, lpl */ 2663 lpl->lpl_cpus = cp->cpu_next_lpl = cp->cpu_prev_lpl = NULL; 2664 2665 lpl_leaf_remove(leaf_lpl, cp->cpu_part); 2666 } else { 2667 2668 /* unlink cpu from lists of cpus in lpl */ 2669 cp->cpu_prev_lpl->cpu_next_lpl = cp->cpu_next_lpl; 2670 cp->cpu_next_lpl->cpu_prev_lpl = cp->cpu_prev_lpl; 2671 if (lpl->lpl_cpus == cp) { 2672 lpl->lpl_cpus = cp->cpu_next_lpl; 2673 } 2674 2675 /* 2676 * Update the cpu count in the lpls associated with parent 2677 * lgroups. 2678 */ 2679 lpl_cpu_adjcnt(LPL_DECREMENT, cp); 2680 2681 } 2682 /* clear cpu's lpl ptr when we're all done */ 2683 cp->cpu_lpl = NULL; 2684 } 2685 2686 /* 2687 * Recompute load average for the specified partition/lgrp fragment. 2688 * 2689 * We rely on the fact that this routine is called from the clock thread 2690 * at a point before the clock thread can block (i.e. before its first 2691 * lock request). Since the clock thread can not be preempted (since it 2692 * runs at highest priority), we know that cpu partitions can not change 2693 * (since doing so would require either the repartition requester or the 2694 * cpu_pause thread to run on this cpu), so we can update the cpu's load 2695 * without grabbing cpu_lock. 2696 */ 2697 void 2698 lgrp_loadavg(lpl_t *lpl, uint_t nrcpus, int ageflag) 2699 { 2700 uint_t ncpu; 2701 int64_t old, new, f; 2702 2703 /* 2704 * 1 - exp(-1/(20 * ncpu)) << 13 = 400 for 1 cpu... 2705 */ 2706 static short expval[] = { 2707 0, 3196, 1618, 1083, 2708 814, 652, 543, 466, 2709 408, 363, 326, 297, 2710 272, 251, 233, 218, 2711 204, 192, 181, 172, 2712 163, 155, 148, 142, 2713 136, 130, 125, 121, 2714 116, 112, 109, 105 2715 }; 2716 2717 /* ASSERT (called from clock level) */ 2718 2719 if ((lpl == NULL) || /* we're booting - this is easiest for now */ 2720 ((ncpu = lpl->lpl_ncpu) == 0)) { 2721 return; 2722 } 2723 2724 for (;;) { 2725 2726 if (ncpu >= sizeof (expval) / sizeof (expval[0])) 2727 f = expval[1]/ncpu; /* good approx. for large ncpu */ 2728 else 2729 f = expval[ncpu]; 2730 2731 /* 2732 * Modify the load average atomically to avoid losing 2733 * anticipatory load updates (see lgrp_move_thread()). 2734 */ 2735 if (ageflag) { 2736 /* 2737 * We're supposed to both update and age the load. 2738 * This happens 10 times/sec. per cpu. We do a 2739 * little hoop-jumping to avoid integer overflow. 2740 */ 2741 int64_t q, r; 2742 2743 do { 2744 old = new = lpl->lpl_loadavg; 2745 q = (old >> 16) << 7; 2746 r = (old & 0xffff) << 7; 2747 new += ((long long)(nrcpus - q) * f - 2748 ((r * f) >> 16)) >> 7; 2749 2750 /* 2751 * Check for overflow 2752 */ 2753 if (new > LGRP_LOADAVG_MAX) 2754 new = LGRP_LOADAVG_MAX; 2755 else if (new < 0) 2756 new = 0; 2757 } while (cas32((lgrp_load_t *)&lpl->lpl_loadavg, old, 2758 new) != old); 2759 } else { 2760 /* 2761 * We're supposed to update the load, but not age it. 2762 * This option is used to update the load (which either 2763 * has already been aged in this 1/10 sec. interval or 2764 * soon will be) to account for a remotely executing 2765 * thread. 2766 */ 2767 do { 2768 old = new = lpl->lpl_loadavg; 2769 new += f; 2770 /* 2771 * Check for overflow 2772 * Underflow not possible here 2773 */ 2774 if (new < old) 2775 new = LGRP_LOADAVG_MAX; 2776 } while (cas32((lgrp_load_t *)&lpl->lpl_loadavg, old, 2777 new) != old); 2778 } 2779 2780 /* 2781 * Do the same for this lpl's parent 2782 */ 2783 if ((lpl = lpl->lpl_parent) == NULL) 2784 break; 2785 ncpu = lpl->lpl_ncpu; 2786 } 2787 } 2788 2789 /* 2790 * Initialize lpl topology in the target based on topology currently present in 2791 * lpl_bootstrap. 2792 * 2793 * lpl_topo_bootstrap is only called once from cpupart_initialize_default() to 2794 * initialize cp_default list of lpls. Up to this point all topology operations 2795 * were performed using lpl_bootstrap. Now cp_default has its own list of lpls 2796 * and all subsequent lpl operations should use it instead of lpl_bootstrap. The 2797 * `target' points to the list of lpls in cp_default and `size' is the size of 2798 * this list. 2799 * 2800 * This function walks the lpl topology in lpl_bootstrap and does for things: 2801 * 2802 * 1) Copies all fields from lpl_bootstrap to the target. 2803 * 2804 * 2) Sets CPU0 lpl pointer to the correct element of the target list. 2805 * 2806 * 3) Updates lpl_parent pointers to point to the lpls in the target list 2807 * instead of lpl_bootstrap. 2808 * 2809 * 4) Updates pointers in the resource list of the target to point to the lpls 2810 * in the target list instead of lpl_bootstrap. 2811 * 2812 * After lpl_topo_bootstrap() completes, target contains the same information 2813 * that would be present there if it were used during boot instead of 2814 * lpl_bootstrap. There is no need in information in lpl_bootstrap after this 2815 * and it is bzeroed. 2816 */ 2817 void 2818 lpl_topo_bootstrap(lpl_t *target, int size) 2819 { 2820 lpl_t *lpl = lpl_bootstrap; 2821 lpl_t *target_lpl = target; 2822 lpl_t **rset; 2823 int *id2rset; 2824 int sz; 2825 int howmany; 2826 int id; 2827 int i; 2828 2829 /* 2830 * The only target that should be passed here is cp_default lpl list. 2831 */ 2832 ASSERT(target == cp_default.cp_lgrploads); 2833 ASSERT(size == cp_default.cp_nlgrploads); 2834 ASSERT(!lgrp_topo_initialized); 2835 ASSERT(ncpus == 1); 2836 2837 howmany = MIN(LPL_BOOTSTRAP_SIZE, size); 2838 for (i = 0; i < howmany; i++, lpl++, target_lpl++) { 2839 /* 2840 * Copy all fields from lpl, except for the rset, 2841 * lgrp id <=> rset mapping storage, 2842 * and amount of storage 2843 */ 2844 rset = target_lpl->lpl_rset; 2845 id2rset = target_lpl->lpl_id2rset; 2846 sz = target_lpl->lpl_rset_sz; 2847 2848 *target_lpl = *lpl; 2849 2850 target_lpl->lpl_rset_sz = sz; 2851 target_lpl->lpl_rset = rset; 2852 target_lpl->lpl_id2rset = id2rset; 2853 2854 /* 2855 * Substitute CPU0 lpl pointer with one relative to target. 2856 */ 2857 if (lpl->lpl_cpus == CPU) { 2858 ASSERT(CPU->cpu_lpl == lpl); 2859 CPU->cpu_lpl = target_lpl; 2860 } 2861 2862 /* 2863 * Substitute parent information with parent relative to target. 2864 */ 2865 if (lpl->lpl_parent != NULL) 2866 target_lpl->lpl_parent = (lpl_t *) 2867 (((uintptr_t)lpl->lpl_parent - 2868 (uintptr_t)lpl_bootstrap) + 2869 (uintptr_t)target); 2870 2871 /* 2872 * Walk over resource set substituting pointers relative to 2873 * lpl_bootstrap's rset to pointers relative to target's 2874 */ 2875 ASSERT(lpl->lpl_nrset <= 1); 2876 2877 for (id = 0; id < lpl->lpl_nrset; id++) { 2878 if (lpl->lpl_rset[id] != NULL) { 2879 target_lpl->lpl_rset[id] = (lpl_t *) 2880 (((uintptr_t)lpl->lpl_rset[id] - 2881 (uintptr_t)lpl_bootstrap) + 2882 (uintptr_t)target); 2883 } 2884 target_lpl->lpl_id2rset[id] = 2885 lpl->lpl_id2rset[id]; 2886 } 2887 } 2888 2889 /* 2890 * Clean up the bootstrap lpls since we have switched over to the 2891 * actual lpl array in the default cpu partition. 2892 * 2893 * We still need to keep one empty lpl around for newly starting 2894 * slave CPUs to reference should they need to make it through the 2895 * dispatcher prior to their lgrp/lpl initialization. 2896 * 2897 * The lpl related dispatcher code has been designed to work properly 2898 * (and without extra checks) for this special case of a zero'ed 2899 * bootstrap lpl. Such an lpl appears to the dispatcher as an lpl 2900 * with lgrpid 0 and an empty resource set. Iteration over the rset 2901 * array by the dispatcher is also NULL terminated for this reason. 2902 * 2903 * This provides the desired behaviour for an uninitialized CPU. 2904 * It shouldn't see any other CPU to either dispatch to or steal 2905 * from until it is properly initialized. 2906 */ 2907 bzero(lpl_bootstrap_list, sizeof (lpl_bootstrap_list)); 2908 bzero(lpl_bootstrap_id2rset, sizeof (lpl_bootstrap_id2rset)); 2909 bzero(lpl_bootstrap_rset, sizeof (lpl_bootstrap_rset)); 2910 2911 lpl_bootstrap_list[0].lpl_rset = lpl_bootstrap_rset; 2912 lpl_bootstrap_list[0].lpl_id2rset = lpl_bootstrap_id2rset; 2913 } 2914 2915 /* 2916 * If the lowest load among the lgroups a process' threads are currently 2917 * spread across is greater than lgrp_expand_proc_thresh, we'll consider 2918 * expanding the process to a new lgroup. 2919 */ 2920 #define LGRP_EXPAND_PROC_THRESH_DEFAULT 62250 2921 lgrp_load_t lgrp_expand_proc_thresh = LGRP_EXPAND_PROC_THRESH_DEFAULT; 2922 2923 #define LGRP_EXPAND_PROC_THRESH(ncpu) \ 2924 ((lgrp_expand_proc_thresh) / (ncpu)) 2925 2926 /* 2927 * A process will be expanded to a new lgroup only if the difference between 2928 * the lowest load on the lgroups the process' thread's are currently spread 2929 * across and the lowest load on the other lgroups in the process' partition 2930 * is greater than lgrp_expand_proc_diff. 2931 */ 2932 #define LGRP_EXPAND_PROC_DIFF_DEFAULT 60000 2933 lgrp_load_t lgrp_expand_proc_diff = LGRP_EXPAND_PROC_DIFF_DEFAULT; 2934 2935 #define LGRP_EXPAND_PROC_DIFF(ncpu) \ 2936 ((lgrp_expand_proc_diff) / (ncpu)) 2937 2938 /* 2939 * The loadavg tolerance accounts for "noise" inherent in the load, which may 2940 * be present due to impreciseness of the load average decay algorithm. 2941 * 2942 * The default tolerance is lgrp_loadavg_max_effect. Note that the tunable 2943 * tolerance is scaled by the number of cpus in the lgroup just like 2944 * lgrp_loadavg_max_effect. For example, if lgrp_loadavg_tolerance = 0x10000, 2945 * and ncpu = 4, then lgrp_choose will consider differences in lgroup loads 2946 * of: 0x10000 / 4 => 0x4000 or greater to be significant. 2947 */ 2948 uint32_t lgrp_loadavg_tolerance = LGRP_LOADAVG_THREAD_MAX; 2949 #define LGRP_LOADAVG_TOLERANCE(ncpu) \ 2950 ((lgrp_loadavg_tolerance) / ncpu) 2951 2952 /* 2953 * lgrp_choose() will choose root lgroup as home when lowest lgroup load 2954 * average is above this threshold 2955 */ 2956 uint32_t lgrp_load_thresh = UINT32_MAX; 2957 2958 /* 2959 * lgrp_choose() will try to skip any lgroups with less memory 2960 * than this free when choosing a home lgroup 2961 */ 2962 pgcnt_t lgrp_mem_free_thresh = 0; 2963 2964 /* 2965 * When choosing between similarly loaded lgroups, lgrp_choose() will pick 2966 * one based on one of the following policies: 2967 * - Random selection 2968 * - Pseudo round robin placement 2969 * - Longest time since a thread was last placed 2970 */ 2971 #define LGRP_CHOOSE_RANDOM 1 2972 #define LGRP_CHOOSE_RR 2 2973 #define LGRP_CHOOSE_TIME 3 2974 2975 int lgrp_choose_policy = LGRP_CHOOSE_TIME; 2976 2977 /* 2978 * Choose a suitable leaf lgroup for a kthread. The kthread is assumed not to 2979 * be bound to a CPU or processor set. 2980 * 2981 * Arguments: 2982 * t The thread 2983 * cpupart The partition the thread belongs to. 2984 * 2985 * NOTE: Should at least be called with the cpu_lock held, kernel preemption 2986 * disabled, or thread_lock held (at splhigh) to protect against the CPU 2987 * partitions changing out from under us and assumes that given thread is 2988 * protected. Also, called sometimes w/ cpus paused or kernel preemption 2989 * disabled, so don't grab any locks because we should never block under 2990 * those conditions. 2991 */ 2992 lpl_t * 2993 lgrp_choose(kthread_t *t, cpupart_t *cpupart) 2994 { 2995 lgrp_load_t bestload, bestrload; 2996 int lgrpid_offset, lgrp_count; 2997 lgrp_id_t lgrpid, lgrpid_start; 2998 lpl_t *lpl, *bestlpl, *bestrlpl; 2999 klgrpset_t lgrpset; 3000 proc_t *p; 3001 3002 ASSERT(t != NULL); 3003 ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 || 3004 THREAD_LOCK_HELD(t)); 3005 ASSERT(cpupart != NULL); 3006 3007 p = t->t_procp; 3008 3009 /* A process should always be in an active partition */ 3010 ASSERT(!klgrpset_isempty(cpupart->cp_lgrpset)); 3011 3012 bestlpl = bestrlpl = NULL; 3013 bestload = bestrload = LGRP_LOADAVG_MAX; 3014 lgrpset = cpupart->cp_lgrpset; 3015 3016 switch (lgrp_choose_policy) { 3017 case LGRP_CHOOSE_RR: 3018 lgrpid = cpupart->cp_lgrp_hint; 3019 do { 3020 if (++lgrpid > lgrp_alloc_max) 3021 lgrpid = 0; 3022 } while (!klgrpset_ismember(lgrpset, lgrpid)); 3023 3024 break; 3025 default: 3026 case LGRP_CHOOSE_TIME: 3027 case LGRP_CHOOSE_RANDOM: 3028 klgrpset_nlgrps(lgrpset, lgrp_count); 3029 lgrpid_offset = 3030 (((ushort_t)(gethrtime() >> 4)) % lgrp_count) + 1; 3031 for (lgrpid = 0; ; lgrpid++) { 3032 if (klgrpset_ismember(lgrpset, lgrpid)) { 3033 if (--lgrpid_offset == 0) 3034 break; 3035 } 3036 } 3037 break; 3038 } 3039 3040 lgrpid_start = lgrpid; 3041 3042 DTRACE_PROBE2(lgrp_choose_start, lgrp_id_t, lgrpid_start, 3043 lgrp_id_t, cpupart->cp_lgrp_hint); 3044 3045 /* 3046 * Use lgroup affinities (if any) to choose best lgroup 3047 * 3048 * NOTE: Assumes that thread is protected from going away and its 3049 * lgroup affinities won't change (ie. p_lock, or 3050 * thread_lock() being held and/or CPUs paused) 3051 */ 3052 if (t->t_lgrp_affinity) { 3053 lpl = lgrp_affinity_best(t, cpupart, lgrpid_start, B_FALSE); 3054 if (lpl != NULL) 3055 return (lpl); 3056 } 3057 3058 ASSERT(klgrpset_ismember(lgrpset, lgrpid_start)); 3059 3060 do { 3061 pgcnt_t npgs; 3062 3063 /* 3064 * Skip any lgroups outside of thread's pset 3065 */ 3066 if (!klgrpset_ismember(lgrpset, lgrpid)) { 3067 if (++lgrpid > lgrp_alloc_max) 3068 lgrpid = 0; /* wrap the search */ 3069 continue; 3070 } 3071 3072 /* 3073 * Skip any non-leaf lgroups 3074 */ 3075 if (lgrp_table[lgrpid]->lgrp_childcnt != 0) 3076 continue; 3077 3078 /* 3079 * Skip any lgroups without enough free memory 3080 * (when threshold set to nonzero positive value) 3081 */ 3082 if (lgrp_mem_free_thresh > 0) { 3083 npgs = lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_FREE); 3084 if (npgs < lgrp_mem_free_thresh) { 3085 if (++lgrpid > lgrp_alloc_max) 3086 lgrpid = 0; /* wrap the search */ 3087 continue; 3088 } 3089 } 3090 3091 lpl = &cpupart->cp_lgrploads[lgrpid]; 3092 if (klgrpset_isempty(p->p_lgrpset) || 3093 klgrpset_ismember(p->p_lgrpset, lgrpid)) { 3094 /* 3095 * Either this is a new process or the process already 3096 * has threads on this lgrp, so this is a preferred 3097 * lgroup for the thread. 3098 */ 3099 if (bestlpl == NULL || 3100 lpl_pick(lpl, bestlpl)) { 3101 bestload = lpl->lpl_loadavg; 3102 bestlpl = lpl; 3103 } 3104 } else { 3105 /* 3106 * The process doesn't have any threads on this lgrp, 3107 * but we're willing to consider this lgrp if the load 3108 * difference is big enough to justify splitting up 3109 * the process' threads. 3110 */ 3111 if (bestrlpl == NULL || 3112 lpl_pick(lpl, bestrlpl)) { 3113 bestrload = lpl->lpl_loadavg; 3114 bestrlpl = lpl; 3115 } 3116 } 3117 if (++lgrpid > lgrp_alloc_max) 3118 lgrpid = 0; /* wrap the search */ 3119 } while (lgrpid != lgrpid_start); 3120 3121 /* 3122 * Return root lgroup if threshold isn't set to maximum value and 3123 * lowest lgroup load average more than a certain threshold 3124 */ 3125 if (lgrp_load_thresh != UINT32_MAX && 3126 bestload >= lgrp_load_thresh && bestrload >= lgrp_load_thresh) 3127 return (&cpupart->cp_lgrploads[lgrp_root->lgrp_id]); 3128 3129 /* 3130 * If all the lgroups over which the thread's process is spread are 3131 * heavily loaded, or otherwise undesirable, we'll consider placing 3132 * the thread on one of the other leaf lgroups in the thread's 3133 * partition. 3134 */ 3135 if ((bestlpl == NULL) || 3136 ((bestload > LGRP_EXPAND_PROC_THRESH(bestlpl->lpl_ncpu)) && 3137 (bestrload < bestload) && /* paranoid about wraparound */ 3138 (bestrload + LGRP_EXPAND_PROC_DIFF(bestrlpl->lpl_ncpu) < 3139 bestload))) { 3140 bestlpl = bestrlpl; 3141 } 3142 3143 if (bestlpl == NULL) { 3144 /* 3145 * No lgroup looked particularly good, but we still 3146 * have to pick something. Go with the randomly selected 3147 * legal lgroup we started with above. 3148 */ 3149 bestlpl = &cpupart->cp_lgrploads[lgrpid_start]; 3150 } 3151 3152 cpupart->cp_lgrp_hint = bestlpl->lpl_lgrpid; 3153 bestlpl->lpl_homed_time = gethrtime_unscaled(); 3154 3155 ASSERT(bestlpl->lpl_ncpu > 0); 3156 return (bestlpl); 3157 } 3158 3159 /* 3160 * Decide if lpl1 is a better candidate than lpl2 for lgrp homing. 3161 * Returns non-zero if lpl1 is a better candidate, and 0 otherwise. 3162 */ 3163 static int 3164 lpl_pick(lpl_t *lpl1, lpl_t *lpl2) 3165 { 3166 lgrp_load_t l1, l2; 3167 lgrp_load_t tolerance = LGRP_LOADAVG_TOLERANCE(lpl1->lpl_ncpu); 3168 3169 l1 = lpl1->lpl_loadavg; 3170 l2 = lpl2->lpl_loadavg; 3171 3172 if ((l1 + tolerance < l2) && (l1 < l2)) { 3173 /* lpl1 is significantly less loaded than lpl2 */ 3174 return (1); 3175 } 3176 3177 if (lgrp_choose_policy == LGRP_CHOOSE_TIME && 3178 l1 + tolerance >= l2 && l1 < l2 && 3179 lpl1->lpl_homed_time < lpl2->lpl_homed_time) { 3180 /* 3181 * lpl1's load is within the tolerance of lpl2. We're 3182 * willing to consider it be to better however if 3183 * it has been longer since we last homed a thread there 3184 */ 3185 return (1); 3186 } 3187 3188 return (0); 3189 } 3190 3191 /* 3192 * lgrp_trthr_moves counts the number of times main thread (t_tid = 1) of a 3193 * process that uses text replication changed home lgrp. This info is used by 3194 * segvn asyncronous thread to detect if it needs to recheck what lgrps 3195 * should be used for text replication. 3196 */ 3197 static uint64_t lgrp_trthr_moves = 0; 3198 3199 uint64_t 3200 lgrp_get_trthr_migrations(void) 3201 { 3202 return (lgrp_trthr_moves); 3203 } 3204 3205 void 3206 lgrp_update_trthr_migrations(uint64_t incr) 3207 { 3208 atomic_add_64(&lgrp_trthr_moves, incr); 3209 } 3210 3211 /* 3212 * An LWP is expected to be assigned to an lgroup for at least this long 3213 * for its anticipatory load to be justified. NOTE that this value should 3214 * not be set extremely huge (say, larger than 100 years), to avoid problems 3215 * with overflow in the calculation that uses it. 3216 */ 3217 #define LGRP_MIN_NSEC (NANOSEC / 10) /* 1/10 of a second */ 3218 hrtime_t lgrp_min_nsec = LGRP_MIN_NSEC; 3219 3220 /* 3221 * Routine to change a thread's lgroup affiliation. This routine updates 3222 * the thread's kthread_t struct and its process' proc_t struct to note the 3223 * thread's new lgroup affiliation, and its lgroup affinities. 3224 * 3225 * Note that this is the only routine that modifies a thread's t_lpl field, 3226 * and that adds in or removes anticipatory load. 3227 * 3228 * If the thread is exiting, newlpl is NULL. 3229 * 3230 * Locking: 3231 * The following lock must be held on entry: 3232 * cpu_lock, kpreempt_disable(), or thread_lock -- to assure t's new lgrp 3233 * doesn't get removed from t's partition 3234 * 3235 * This routine is not allowed to grab any locks, since it may be called 3236 * with cpus paused (such as from cpu_offline). 3237 */ 3238 void 3239 lgrp_move_thread(kthread_t *t, lpl_t *newlpl, int do_lgrpset_delete) 3240 { 3241 proc_t *p; 3242 lpl_t *lpl, *oldlpl; 3243 lgrp_id_t oldid; 3244 kthread_t *tp; 3245 uint_t ncpu; 3246 lgrp_load_t old, new; 3247 3248 ASSERT(t); 3249 ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 || 3250 THREAD_LOCK_HELD(t)); 3251 3252 /* 3253 * If not changing lpls, just return 3254 */ 3255 if ((oldlpl = t->t_lpl) == newlpl) 3256 return; 3257 3258 /* 3259 * Make sure the thread's lwp hasn't exited (if so, this thread is now 3260 * associated with process 0 rather than with its original process). 3261 */ 3262 if (t->t_proc_flag & TP_LWPEXIT) { 3263 if (newlpl != NULL) { 3264 t->t_lpl = newlpl; 3265 } 3266 return; 3267 } 3268 3269 p = ttoproc(t); 3270 3271 /* 3272 * If the thread had a previous lgroup, update its process' p_lgrpset 3273 * to account for it being moved from its old lgroup. 3274 */ 3275 if ((oldlpl != NULL) && /* thread had a previous lgroup */ 3276 (p->p_tlist != NULL)) { 3277 oldid = oldlpl->lpl_lgrpid; 3278 3279 if (newlpl != NULL) 3280 lgrp_stat_add(oldid, LGRP_NUM_MIGR, 1); 3281 3282 if ((do_lgrpset_delete) && 3283 (klgrpset_ismember(p->p_lgrpset, oldid))) { 3284 for (tp = p->p_tlist->t_forw; ; tp = tp->t_forw) { 3285 /* 3286 * Check if a thread other than the thread 3287 * that's moving is assigned to the same 3288 * lgroup as the thread that's moving. Note 3289 * that we have to compare lgroup IDs, rather 3290 * than simply comparing t_lpl's, since the 3291 * threads may belong to different partitions 3292 * but be assigned to the same lgroup. 3293 */ 3294 ASSERT(tp->t_lpl != NULL); 3295 3296 if ((tp != t) && 3297 (tp->t_lpl->lpl_lgrpid == oldid)) { 3298 /* 3299 * Another thread is assigned to the 3300 * same lgroup as the thread that's 3301 * moving, p_lgrpset doesn't change. 3302 */ 3303 break; 3304 } else if (tp == p->p_tlist) { 3305 /* 3306 * No other thread is assigned to the 3307 * same lgroup as the exiting thread, 3308 * clear the lgroup's bit in p_lgrpset. 3309 */ 3310 klgrpset_del(p->p_lgrpset, oldid); 3311 break; 3312 } 3313 } 3314 } 3315 3316 /* 3317 * If this thread was assigned to its old lgroup for such a 3318 * short amount of time that the anticipatory load that was 3319 * added on its behalf has aged very little, remove that 3320 * anticipatory load. 3321 */ 3322 if ((t->t_anttime + lgrp_min_nsec > gethrtime()) && 3323 ((ncpu = oldlpl->lpl_ncpu) > 0)) { 3324 lpl = oldlpl; 3325 for (;;) { 3326 do { 3327 old = new = lpl->lpl_loadavg; 3328 new -= LGRP_LOADAVG_MAX_EFFECT(ncpu); 3329 if (new > old) { 3330 /* 3331 * this can happen if the load 3332 * average was aged since we 3333 * added in the anticipatory 3334 * load 3335 */ 3336 new = 0; 3337 } 3338 } while (cas32( 3339 (lgrp_load_t *)&lpl->lpl_loadavg, old, 3340 new) != old); 3341 3342 lpl = lpl->lpl_parent; 3343 if (lpl == NULL) 3344 break; 3345 3346 ncpu = lpl->lpl_ncpu; 3347 ASSERT(ncpu > 0); 3348 } 3349 } 3350 } 3351 /* 3352 * If the thread has a new lgroup (i.e. it's not exiting), update its 3353 * t_lpl and its process' p_lgrpset, and apply an anticipatory load 3354 * to its new lgroup to account for its move to its new lgroup. 3355 */ 3356 if (newlpl != NULL) { 3357 /* 3358 * This thread is moving to a new lgroup 3359 */ 3360 t->t_lpl = newlpl; 3361 if (t->t_tid == 1 && p->p_t1_lgrpid != newlpl->lpl_lgrpid) { 3362 p->p_t1_lgrpid = newlpl->lpl_lgrpid; 3363 membar_producer(); 3364 if (p->p_tr_lgrpid != LGRP_NONE && 3365 p->p_tr_lgrpid != p->p_t1_lgrpid) { 3366 lgrp_update_trthr_migrations(1); 3367 } 3368 } 3369 3370 /* 3371 * Reflect move in load average of new lgroup 3372 * unless it is root lgroup 3373 */ 3374 if (lgrp_table[newlpl->lpl_lgrpid] == lgrp_root) 3375 return; 3376 3377 if (!klgrpset_ismember(p->p_lgrpset, newlpl->lpl_lgrpid)) { 3378 klgrpset_add(p->p_lgrpset, newlpl->lpl_lgrpid); 3379 } 3380 3381 /* 3382 * It'll take some time for the load on the new lgroup 3383 * to reflect this thread's placement on it. We'd 3384 * like not, however, to have all threads between now 3385 * and then also piling on to this lgroup. To avoid 3386 * this pileup, we anticipate the load this thread 3387 * will generate on its new lgroup. The goal is to 3388 * make the lgroup's load appear as though the thread 3389 * had been there all along. We're very conservative 3390 * in calculating this anticipatory load, we assume 3391 * the worst case case (100% CPU-bound thread). This 3392 * may be modified in the future to be more accurate. 3393 */ 3394 lpl = newlpl; 3395 for (;;) { 3396 ncpu = lpl->lpl_ncpu; 3397 ASSERT(ncpu > 0); 3398 do { 3399 old = new = lpl->lpl_loadavg; 3400 new += LGRP_LOADAVG_MAX_EFFECT(ncpu); 3401 /* 3402 * Check for overflow 3403 * Underflow not possible here 3404 */ 3405 if (new < old) 3406 new = UINT32_MAX; 3407 } while (cas32((lgrp_load_t *)&lpl->lpl_loadavg, old, 3408 new) != old); 3409 3410 lpl = lpl->lpl_parent; 3411 if (lpl == NULL) 3412 break; 3413 } 3414 t->t_anttime = gethrtime(); 3415 } 3416 } 3417 3418 /* 3419 * Return lgroup memory allocation policy given advice from madvise(3C) 3420 */ 3421 lgrp_mem_policy_t 3422 lgrp_madv_to_policy(uchar_t advice, size_t size, int type) 3423 { 3424 switch (advice) { 3425 case MADV_ACCESS_LWP: 3426 return (LGRP_MEM_POLICY_NEXT); 3427 case MADV_ACCESS_MANY: 3428 return (LGRP_MEM_POLICY_RANDOM); 3429 default: 3430 return (lgrp_mem_policy_default(size, type)); 3431 } 3432 } 3433 3434 /* 3435 * Figure out default policy 3436 */ 3437 lgrp_mem_policy_t 3438 lgrp_mem_policy_default(size_t size, int type) 3439 { 3440 cpupart_t *cp; 3441 lgrp_mem_policy_t policy; 3442 size_t pset_mem_size; 3443 3444 /* 3445 * Randomly allocate memory across lgroups for shared memory 3446 * beyond a certain threshold 3447 */ 3448 if ((type != MAP_SHARED && size > lgrp_privm_random_thresh) || 3449 (type == MAP_SHARED && size > lgrp_shm_random_thresh)) { 3450 /* 3451 * Get total memory size of current thread's pset 3452 */ 3453 kpreempt_disable(); 3454 cp = curthread->t_cpupart; 3455 klgrpset_totalsize(cp->cp_lgrpset, pset_mem_size); 3456 kpreempt_enable(); 3457 3458 /* 3459 * Choose policy to randomly allocate memory across 3460 * lgroups in pset if it will fit and is not default 3461 * partition. Otherwise, allocate memory randomly 3462 * across machine. 3463 */ 3464 if (lgrp_mem_pset_aware && size < pset_mem_size) 3465 policy = LGRP_MEM_POLICY_RANDOM_PSET; 3466 else 3467 policy = LGRP_MEM_POLICY_RANDOM; 3468 } else 3469 /* 3470 * Apply default policy for private memory and 3471 * shared memory under the respective random 3472 * threshold. 3473 */ 3474 policy = lgrp_mem_default_policy; 3475 3476 return (policy); 3477 } 3478 3479 /* 3480 * Get memory allocation policy for this segment 3481 */ 3482 lgrp_mem_policy_info_t * 3483 lgrp_mem_policy_get(struct seg *seg, caddr_t vaddr) 3484 { 3485 lgrp_mem_policy_info_t *policy_info; 3486 extern struct seg_ops segspt_ops; 3487 extern struct seg_ops segspt_shmops; 3488 3489 /* 3490 * This is for binary compatibility to protect against third party 3491 * segment drivers which haven't recompiled to allow for 3492 * SEGOP_GETPOLICY() 3493 */ 3494 if (seg->s_ops != &segvn_ops && seg->s_ops != &segspt_ops && 3495 seg->s_ops != &segspt_shmops) 3496 return (NULL); 3497 3498 policy_info = NULL; 3499 if (seg->s_ops->getpolicy != NULL) 3500 policy_info = SEGOP_GETPOLICY(seg, vaddr); 3501 3502 return (policy_info); 3503 } 3504 3505 /* 3506 * Set policy for allocating private memory given desired policy, policy info, 3507 * size in bytes of memory that policy is being applied. 3508 * Return 0 if policy wasn't set already and 1 if policy was set already 3509 */ 3510 int 3511 lgrp_privm_policy_set(lgrp_mem_policy_t policy, 3512 lgrp_mem_policy_info_t *policy_info, size_t size) 3513 { 3514 3515 ASSERT(policy_info != NULL); 3516 3517 if (policy == LGRP_MEM_POLICY_DEFAULT) 3518 policy = lgrp_mem_policy_default(size, MAP_PRIVATE); 3519 3520 /* 3521 * Policy set already? 3522 */ 3523 if (policy == policy_info->mem_policy) 3524 return (1); 3525 3526 /* 3527 * Set policy 3528 */ 3529 policy_info->mem_policy = policy; 3530 policy_info->mem_lgrpid = LGRP_NONE; 3531 3532 return (0); 3533 } 3534 3535 3536 /* 3537 * Get shared memory allocation policy with given tree and offset 3538 */ 3539 lgrp_mem_policy_info_t * 3540 lgrp_shm_policy_get(struct anon_map *amp, ulong_t anon_index, vnode_t *vp, 3541 u_offset_t vn_off) 3542 { 3543 u_offset_t off; 3544 lgrp_mem_policy_info_t *policy_info; 3545 lgrp_shm_policy_seg_t *policy_seg; 3546 lgrp_shm_locality_t *shm_locality; 3547 avl_tree_t *tree; 3548 avl_index_t where; 3549 3550 /* 3551 * Get policy segment tree from anon_map or vnode and use specified 3552 * anon index or vnode offset as offset 3553 * 3554 * Assume that no lock needs to be held on anon_map or vnode, since 3555 * they should be protected by their reference count which must be 3556 * nonzero for an existing segment 3557 */ 3558 if (amp) { 3559 ASSERT(amp->refcnt != 0); 3560 shm_locality = amp->locality; 3561 if (shm_locality == NULL) 3562 return (NULL); 3563 tree = shm_locality->loc_tree; 3564 off = ptob(anon_index); 3565 } else if (vp) { 3566 shm_locality = vp->v_locality; 3567 if (shm_locality == NULL) 3568 return (NULL); 3569 ASSERT(shm_locality->loc_count != 0); 3570 tree = shm_locality->loc_tree; 3571 off = vn_off; 3572 } 3573 3574 if (tree == NULL) 3575 return (NULL); 3576 3577 /* 3578 * Lookup policy segment for offset into shared object and return 3579 * policy info 3580 */ 3581 rw_enter(&shm_locality->loc_lock, RW_READER); 3582 policy_info = NULL; 3583 policy_seg = avl_find(tree, &off, &where); 3584 if (policy_seg) 3585 policy_info = &policy_seg->shm_policy; 3586 rw_exit(&shm_locality->loc_lock); 3587 3588 return (policy_info); 3589 } 3590 3591 /* 3592 * Default memory allocation policy for kernel segmap pages 3593 */ 3594 lgrp_mem_policy_t lgrp_segmap_default_policy = LGRP_MEM_POLICY_RANDOM; 3595 3596 /* 3597 * Return lgroup to use for allocating memory 3598 * given the segment and address 3599 * 3600 * There isn't any mutual exclusion that exists between calls 3601 * to this routine and DR, so this routine and whomever calls it 3602 * should be mindful of the possibility that the lgrp returned 3603 * may be deleted. If this happens, dereferences of the lgrp 3604 * pointer will still be safe, but the resources in the lgrp will 3605 * be gone, and LGRP_EXISTS() will no longer be true. 3606 */ 3607 lgrp_t * 3608 lgrp_mem_choose(struct seg *seg, caddr_t vaddr, size_t pgsz) 3609 { 3610 int i; 3611 lgrp_t *lgrp; 3612 klgrpset_t lgrpset; 3613 int lgrps_spanned; 3614 unsigned long off; 3615 lgrp_mem_policy_t policy; 3616 lgrp_mem_policy_info_t *policy_info; 3617 ushort_t random; 3618 int stat = 0; 3619 extern struct seg *segkmap; 3620 3621 /* 3622 * Just return null if the lgrp framework hasn't finished 3623 * initializing or if this is a UMA machine. 3624 */ 3625 if (nlgrps == 1 || !lgrp_initialized) 3626 return (lgrp_root); 3627 3628 /* 3629 * Get memory allocation policy for this segment 3630 */ 3631 policy = lgrp_mem_default_policy; 3632 if (seg != NULL) { 3633 if (seg->s_as == &kas) { 3634 if (seg == segkmap) 3635 policy = lgrp_segmap_default_policy; 3636 if (policy == LGRP_MEM_POLICY_RANDOM_PROC || 3637 policy == LGRP_MEM_POLICY_RANDOM_PSET) 3638 policy = LGRP_MEM_POLICY_RANDOM; 3639 } else { 3640 policy_info = lgrp_mem_policy_get(seg, vaddr); 3641 if (policy_info != NULL) { 3642 policy = policy_info->mem_policy; 3643 if (policy == LGRP_MEM_POLICY_NEXT_SEG) { 3644 lgrp_id_t id = policy_info->mem_lgrpid; 3645 ASSERT(id != LGRP_NONE); 3646 ASSERT(id < NLGRPS_MAX); 3647 lgrp = lgrp_table[id]; 3648 if (!LGRP_EXISTS(lgrp)) { 3649 policy = LGRP_MEM_POLICY_NEXT; 3650 } else { 3651 lgrp_stat_add(id, 3652 LGRP_NUM_NEXT_SEG, 1); 3653 return (lgrp); 3654 } 3655 } 3656 } 3657 } 3658 } 3659 lgrpset = 0; 3660 3661 /* 3662 * Initialize lgroup to home by default 3663 */ 3664 lgrp = lgrp_home_lgrp(); 3665 3666 /* 3667 * When homing threads on root lgrp, override default memory 3668 * allocation policies with root lgroup memory allocation policy 3669 */ 3670 if (lgrp == lgrp_root) 3671 policy = lgrp_mem_policy_root; 3672 3673 /* 3674 * Implement policy 3675 */ 3676 switch (policy) { 3677 case LGRP_MEM_POLICY_NEXT_CPU: 3678 3679 /* 3680 * Return lgroup of current CPU which faulted on memory 3681 * If the CPU isn't currently in an lgrp, then opt to 3682 * allocate from the root. 3683 * 3684 * Kernel preemption needs to be disabled here to prevent 3685 * the current CPU from going away before lgrp is found. 3686 */ 3687 if (LGRP_CPU_HAS_NO_LGRP(CPU)) { 3688 lgrp = lgrp_root; 3689 } else { 3690 kpreempt_disable(); 3691 lgrp = lgrp_cpu_to_lgrp(CPU); 3692 kpreempt_enable(); 3693 } 3694 break; 3695 3696 case LGRP_MEM_POLICY_NEXT: 3697 case LGRP_MEM_POLICY_DEFAULT: 3698 default: 3699 3700 /* 3701 * Just return current thread's home lgroup 3702 * for default policy (next touch) 3703 * If the thread is homed to the root, 3704 * then the default policy is random across lgroups. 3705 * Fallthrough to the random case. 3706 */ 3707 if (lgrp != lgrp_root) { 3708 if (policy == LGRP_MEM_POLICY_NEXT) 3709 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_NEXT, 1); 3710 else 3711 lgrp_stat_add(lgrp->lgrp_id, 3712 LGRP_NUM_DEFAULT, 1); 3713 break; 3714 } 3715 /* LINTED fallthrough on case statement */ 3716 case LGRP_MEM_POLICY_RANDOM: 3717 3718 /* 3719 * Return a random leaf lgroup with memory 3720 */ 3721 lgrpset = lgrp_root->lgrp_set[LGRP_RSRC_MEM]; 3722 /* 3723 * Count how many lgroups are spanned 3724 */ 3725 klgrpset_nlgrps(lgrpset, lgrps_spanned); 3726 3727 /* 3728 * There may be no memnodes in the root lgroup during DR copy 3729 * rename on a system with only two boards (memnodes) 3730 * configured. In this case just return the root lgrp. 3731 */ 3732 if (lgrps_spanned == 0) { 3733 lgrp = lgrp_root; 3734 break; 3735 } 3736 3737 /* 3738 * Pick a random offset within lgroups spanned 3739 * and return lgroup at that offset 3740 */ 3741 random = (ushort_t)gethrtime() >> 4; 3742 off = random % lgrps_spanned; 3743 ASSERT(off <= lgrp_alloc_max); 3744 3745 for (i = 0; i <= lgrp_alloc_max; i++) { 3746 if (!klgrpset_ismember(lgrpset, i)) 3747 continue; 3748 if (off) 3749 off--; 3750 else { 3751 lgrp = lgrp_table[i]; 3752 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_RANDOM, 3753 1); 3754 break; 3755 } 3756 } 3757 break; 3758 3759 case LGRP_MEM_POLICY_RANDOM_PROC: 3760 3761 /* 3762 * Grab copy of bitmask of lgroups spanned by 3763 * this process 3764 */ 3765 klgrpset_copy(lgrpset, curproc->p_lgrpset); 3766 stat = LGRP_NUM_RANDOM_PROC; 3767 3768 /* LINTED fallthrough on case statement */ 3769 case LGRP_MEM_POLICY_RANDOM_PSET: 3770 3771 if (!stat) 3772 stat = LGRP_NUM_RANDOM_PSET; 3773 3774 if (klgrpset_isempty(lgrpset)) { 3775 /* 3776 * Grab copy of bitmask of lgroups spanned by 3777 * this processor set 3778 */ 3779 kpreempt_disable(); 3780 klgrpset_copy(lgrpset, 3781 curthread->t_cpupart->cp_lgrpset); 3782 kpreempt_enable(); 3783 } 3784 3785 /* 3786 * Count how many lgroups are spanned 3787 */ 3788 klgrpset_nlgrps(lgrpset, lgrps_spanned); 3789 ASSERT(lgrps_spanned <= nlgrps); 3790 3791 /* 3792 * Probably lgrps_spanned should be always non-zero, but to be 3793 * on the safe side we return lgrp_root if it is empty. 3794 */ 3795 if (lgrps_spanned == 0) { 3796 lgrp = lgrp_root; 3797 break; 3798 } 3799 3800 /* 3801 * Pick a random offset within lgroups spanned 3802 * and return lgroup at that offset 3803 */ 3804 random = (ushort_t)gethrtime() >> 4; 3805 off = random % lgrps_spanned; 3806 ASSERT(off <= lgrp_alloc_max); 3807 3808 for (i = 0; i <= lgrp_alloc_max; i++) { 3809 if (!klgrpset_ismember(lgrpset, i)) 3810 continue; 3811 if (off) 3812 off--; 3813 else { 3814 lgrp = lgrp_table[i]; 3815 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_RANDOM, 3816 1); 3817 break; 3818 } 3819 } 3820 break; 3821 3822 case LGRP_MEM_POLICY_ROUNDROBIN: 3823 3824 /* 3825 * Use offset within segment to determine 3826 * offset from home lgroup to choose for 3827 * next lgroup to allocate memory from 3828 */ 3829 off = ((unsigned long)(vaddr - seg->s_base) / pgsz) % 3830 (lgrp_alloc_max + 1); 3831 3832 kpreempt_disable(); 3833 lgrpset = lgrp_root->lgrp_set[LGRP_RSRC_MEM]; 3834 i = lgrp->lgrp_id; 3835 kpreempt_enable(); 3836 3837 while (off > 0) { 3838 i = (i + 1) % (lgrp_alloc_max + 1); 3839 lgrp = lgrp_table[i]; 3840 if (klgrpset_ismember(lgrpset, i)) 3841 off--; 3842 } 3843 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ROUNDROBIN, 1); 3844 3845 break; 3846 } 3847 3848 ASSERT(lgrp != NULL); 3849 return (lgrp); 3850 } 3851 3852 /* 3853 * Return the number of pages in an lgroup 3854 * 3855 * NOTE: NUMA test (numat) driver uses this, so changing arguments or semantics 3856 * could cause tests that rely on the numat driver to fail.... 3857 */ 3858 pgcnt_t 3859 lgrp_mem_size(lgrp_id_t lgrpid, lgrp_mem_query_t query) 3860 { 3861 lgrp_t *lgrp; 3862 3863 lgrp = lgrp_table[lgrpid]; 3864 if (!LGRP_EXISTS(lgrp) || 3865 klgrpset_isempty(lgrp->lgrp_set[LGRP_RSRC_MEM]) || 3866 !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid)) 3867 return (0); 3868 3869 return (lgrp_plat_mem_size(lgrp->lgrp_plathand, query)); 3870 } 3871 3872 /* 3873 * Initialize lgroup shared memory allocation policy support 3874 */ 3875 void 3876 lgrp_shm_policy_init(struct anon_map *amp, vnode_t *vp) 3877 { 3878 lgrp_shm_locality_t *shm_locality; 3879 3880 /* 3881 * Initialize locality field in anon_map 3882 * Don't need any locks because this is called when anon_map is 3883 * allocated, but not used anywhere yet. 3884 */ 3885 if (amp) { 3886 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 3887 if (amp->locality == NULL) { 3888 /* 3889 * Allocate and initialize shared memory locality info 3890 * and set anon_map locality pointer to it 3891 * Drop lock across kmem_alloc(KM_SLEEP) 3892 */ 3893 ANON_LOCK_EXIT(&->a_rwlock); 3894 shm_locality = kmem_alloc(sizeof (*shm_locality), 3895 KM_SLEEP); 3896 rw_init(&shm_locality->loc_lock, NULL, RW_DEFAULT, 3897 NULL); 3898 shm_locality->loc_count = 1; /* not used for amp */ 3899 shm_locality->loc_tree = NULL; 3900 3901 /* 3902 * Reacquire lock and check to see whether anyone beat 3903 * us to initializing the locality info 3904 */ 3905 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 3906 if (amp->locality != NULL) { 3907 rw_destroy(&shm_locality->loc_lock); 3908 kmem_free(shm_locality, 3909 sizeof (*shm_locality)); 3910 } else 3911 amp->locality = shm_locality; 3912 } 3913 ANON_LOCK_EXIT(&->a_rwlock); 3914 return; 3915 } 3916 3917 /* 3918 * Allocate shared vnode policy info if vnode is not locality aware yet 3919 */ 3920 mutex_enter(&vp->v_lock); 3921 if ((vp->v_flag & V_LOCALITY) == 0) { 3922 /* 3923 * Allocate and initialize shared memory locality info 3924 */ 3925 mutex_exit(&vp->v_lock); 3926 shm_locality = kmem_alloc(sizeof (*shm_locality), KM_SLEEP); 3927 rw_init(&shm_locality->loc_lock, NULL, RW_DEFAULT, NULL); 3928 shm_locality->loc_count = 1; 3929 shm_locality->loc_tree = NULL; 3930 3931 /* 3932 * Point vnode locality field at shared vnode policy info 3933 * and set locality aware flag in vnode 3934 */ 3935 mutex_enter(&vp->v_lock); 3936 if ((vp->v_flag & V_LOCALITY) == 0) { 3937 vp->v_locality = shm_locality; 3938 vp->v_flag |= V_LOCALITY; 3939 } else { 3940 /* 3941 * Lost race so free locality info and increment count. 3942 */ 3943 rw_destroy(&shm_locality->loc_lock); 3944 kmem_free(shm_locality, sizeof (*shm_locality)); 3945 shm_locality = vp->v_locality; 3946 shm_locality->loc_count++; 3947 } 3948 mutex_exit(&vp->v_lock); 3949 3950 return; 3951 } 3952 3953 /* 3954 * Increment reference count of number of segments mapping this vnode 3955 * shared 3956 */ 3957 shm_locality = vp->v_locality; 3958 shm_locality->loc_count++; 3959 mutex_exit(&vp->v_lock); 3960 } 3961 3962 /* 3963 * Destroy the given shared memory policy segment tree 3964 */ 3965 void 3966 lgrp_shm_policy_tree_destroy(avl_tree_t *tree) 3967 { 3968 lgrp_shm_policy_seg_t *cur; 3969 lgrp_shm_policy_seg_t *next; 3970 3971 if (tree == NULL) 3972 return; 3973 3974 cur = (lgrp_shm_policy_seg_t *)avl_first(tree); 3975 while (cur != NULL) { 3976 next = AVL_NEXT(tree, cur); 3977 avl_remove(tree, cur); 3978 kmem_free(cur, sizeof (*cur)); 3979 cur = next; 3980 } 3981 kmem_free(tree, sizeof (avl_tree_t)); 3982 } 3983 3984 /* 3985 * Uninitialize lgroup shared memory allocation policy support 3986 */ 3987 void 3988 lgrp_shm_policy_fini(struct anon_map *amp, vnode_t *vp) 3989 { 3990 lgrp_shm_locality_t *shm_locality; 3991 3992 /* 3993 * For anon_map, deallocate shared memory policy tree and 3994 * zero locality field 3995 * Don't need any locks because anon_map is being freed 3996 */ 3997 if (amp) { 3998 if (amp->locality == NULL) 3999 return; 4000 shm_locality = amp->locality; 4001 shm_locality->loc_count = 0; /* not really used for amp */ 4002 rw_destroy(&shm_locality->loc_lock); 4003 lgrp_shm_policy_tree_destroy(shm_locality->loc_tree); 4004 kmem_free(shm_locality, sizeof (*shm_locality)); 4005 amp->locality = 0; 4006 return; 4007 } 4008 4009 /* 4010 * For vnode, decrement reference count of segments mapping this vnode 4011 * shared and delete locality info if reference count drops to 0 4012 */ 4013 mutex_enter(&vp->v_lock); 4014 shm_locality = vp->v_locality; 4015 shm_locality->loc_count--; 4016 4017 if (shm_locality->loc_count == 0) { 4018 rw_destroy(&shm_locality->loc_lock); 4019 lgrp_shm_policy_tree_destroy(shm_locality->loc_tree); 4020 kmem_free(shm_locality, sizeof (*shm_locality)); 4021 vp->v_locality = 0; 4022 vp->v_flag &= ~V_LOCALITY; 4023 } 4024 mutex_exit(&vp->v_lock); 4025 } 4026 4027 /* 4028 * Compare two shared memory policy segments 4029 * Used by AVL tree code for searching 4030 */ 4031 int 4032 lgrp_shm_policy_compar(const void *x, const void *y) 4033 { 4034 lgrp_shm_policy_seg_t *a = (lgrp_shm_policy_seg_t *)x; 4035 lgrp_shm_policy_seg_t *b = (lgrp_shm_policy_seg_t *)y; 4036 4037 if (a->shm_off < b->shm_off) 4038 return (-1); 4039 if (a->shm_off >= b->shm_off + b->shm_size) 4040 return (1); 4041 return (0); 4042 } 4043 4044 /* 4045 * Concatenate seg1 with seg2 and remove seg2 4046 */ 4047 static int 4048 lgrp_shm_policy_concat(avl_tree_t *tree, lgrp_shm_policy_seg_t *seg1, 4049 lgrp_shm_policy_seg_t *seg2) 4050 { 4051 if (!seg1 || !seg2 || 4052 seg1->shm_off + seg1->shm_size != seg2->shm_off || 4053 seg1->shm_policy.mem_policy != seg2->shm_policy.mem_policy) 4054 return (-1); 4055 4056 seg1->shm_size += seg2->shm_size; 4057 avl_remove(tree, seg2); 4058 kmem_free(seg2, sizeof (*seg2)); 4059 return (0); 4060 } 4061 4062 /* 4063 * Split segment at given offset and return rightmost (uppermost) segment 4064 * Assumes that there are no overlapping segments 4065 */ 4066 static lgrp_shm_policy_seg_t * 4067 lgrp_shm_policy_split(avl_tree_t *tree, lgrp_shm_policy_seg_t *seg, 4068 u_offset_t off) 4069 { 4070 lgrp_shm_policy_seg_t *newseg; 4071 avl_index_t where; 4072 4073 ASSERT(seg != NULL); 4074 ASSERT(off >= seg->shm_off && off <= seg->shm_off + seg->shm_size); 4075 4076 if (!seg || off < seg->shm_off || off > seg->shm_off + 4077 seg->shm_size) 4078 return (NULL); 4079 4080 if (off == seg->shm_off || off == seg->shm_off + seg->shm_size) 4081 return (seg); 4082 4083 /* 4084 * Adjust size of left segment and allocate new (right) segment 4085 */ 4086 newseg = kmem_alloc(sizeof (lgrp_shm_policy_seg_t), KM_SLEEP); 4087 newseg->shm_policy = seg->shm_policy; 4088 newseg->shm_off = off; 4089 newseg->shm_size = seg->shm_size - (off - seg->shm_off); 4090 seg->shm_size = off - seg->shm_off; 4091 4092 /* 4093 * Find where to insert new segment in AVL tree and insert it 4094 */ 4095 (void) avl_find(tree, &off, &where); 4096 avl_insert(tree, newseg, where); 4097 4098 return (newseg); 4099 } 4100 4101 /* 4102 * Set shared memory allocation policy on specified shared object at given 4103 * offset and length 4104 * 4105 * Return 0 if policy wasn't set already, 1 if policy was set already, and 4106 * -1 if can't set policy. 4107 */ 4108 int 4109 lgrp_shm_policy_set(lgrp_mem_policy_t policy, struct anon_map *amp, 4110 ulong_t anon_index, vnode_t *vp, u_offset_t vn_off, size_t len) 4111 { 4112 u_offset_t eoff; 4113 lgrp_shm_policy_seg_t *next; 4114 lgrp_shm_policy_seg_t *newseg; 4115 u_offset_t off; 4116 u_offset_t oldeoff; 4117 lgrp_shm_policy_seg_t *prev; 4118 int retval; 4119 lgrp_shm_policy_seg_t *seg; 4120 lgrp_shm_locality_t *shm_locality; 4121 avl_tree_t *tree; 4122 avl_index_t where; 4123 4124 ASSERT(amp || vp); 4125 ASSERT((len & PAGEOFFSET) == 0); 4126 4127 if (len == 0) 4128 return (-1); 4129 4130 retval = 0; 4131 4132 /* 4133 * Get locality info and starting offset into shared object 4134 * Try anon map first and then vnode 4135 * Assume that no locks need to be held on anon_map or vnode, since 4136 * it should be protected by its reference count which must be nonzero 4137 * for an existing segment. 4138 */ 4139 if (amp) { 4140 /* 4141 * Get policy info from anon_map 4142 * 4143 */ 4144 ASSERT(amp->refcnt != 0); 4145 if (amp->locality == NULL) 4146 lgrp_shm_policy_init(amp, NULL); 4147 shm_locality = amp->locality; 4148 off = ptob(anon_index); 4149 } else if (vp) { 4150 /* 4151 * Get policy info from vnode 4152 */ 4153 if ((vp->v_flag & V_LOCALITY) == 0 || vp->v_locality == NULL) 4154 lgrp_shm_policy_init(NULL, vp); 4155 shm_locality = vp->v_locality; 4156 ASSERT(shm_locality->loc_count != 0); 4157 off = vn_off; 4158 } else 4159 return (-1); 4160 4161 ASSERT((off & PAGEOFFSET) == 0); 4162 4163 /* 4164 * Figure out default policy 4165 */ 4166 if (policy == LGRP_MEM_POLICY_DEFAULT) 4167 policy = lgrp_mem_policy_default(len, MAP_SHARED); 4168 4169 /* 4170 * Create AVL tree if there isn't one yet 4171 * and set locality field to point at it 4172 */ 4173 rw_enter(&shm_locality->loc_lock, RW_WRITER); 4174 tree = shm_locality->loc_tree; 4175 if (!tree) { 4176 rw_exit(&shm_locality->loc_lock); 4177 4178 tree = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP); 4179 4180 rw_enter(&shm_locality->loc_lock, RW_WRITER); 4181 if (shm_locality->loc_tree == NULL) { 4182 avl_create(tree, lgrp_shm_policy_compar, 4183 sizeof (lgrp_shm_policy_seg_t), 4184 offsetof(lgrp_shm_policy_seg_t, shm_tree)); 4185 shm_locality->loc_tree = tree; 4186 } else { 4187 /* 4188 * Another thread managed to set up the tree 4189 * before we could. Free the tree we allocated 4190 * and use the one that's already there. 4191 */ 4192 kmem_free(tree, sizeof (*tree)); 4193 tree = shm_locality->loc_tree; 4194 } 4195 } 4196 4197 /* 4198 * Set policy 4199 * 4200 * Need to maintain hold on writer's lock to keep tree from 4201 * changing out from under us 4202 */ 4203 while (len != 0) { 4204 /* 4205 * Find policy segment for specified offset into shared object 4206 */ 4207 seg = avl_find(tree, &off, &where); 4208 4209 /* 4210 * Didn't find any existing segment that contains specified 4211 * offset, so allocate new segment, insert it, and concatenate 4212 * with adjacent segments if possible 4213 */ 4214 if (seg == NULL) { 4215 newseg = kmem_alloc(sizeof (lgrp_shm_policy_seg_t), 4216 KM_SLEEP); 4217 newseg->shm_policy.mem_policy = policy; 4218 newseg->shm_policy.mem_lgrpid = LGRP_NONE; 4219 newseg->shm_off = off; 4220 avl_insert(tree, newseg, where); 4221 4222 /* 4223 * Check to see whether new segment overlaps with next 4224 * one, set length of new segment accordingly, and 4225 * calculate remaining length and next offset 4226 */ 4227 seg = AVL_NEXT(tree, newseg); 4228 if (seg == NULL || off + len <= seg->shm_off) { 4229 newseg->shm_size = len; 4230 len = 0; 4231 } else { 4232 newseg->shm_size = seg->shm_off - off; 4233 off = seg->shm_off; 4234 len -= newseg->shm_size; 4235 } 4236 4237 /* 4238 * Try to concatenate new segment with next and 4239 * previous ones, since they might have the same policy 4240 * now. Grab previous and next segments first because 4241 * they will change on concatenation. 4242 */ 4243 prev = AVL_PREV(tree, newseg); 4244 next = AVL_NEXT(tree, newseg); 4245 (void) lgrp_shm_policy_concat(tree, newseg, next); 4246 (void) lgrp_shm_policy_concat(tree, prev, newseg); 4247 4248 continue; 4249 } 4250 4251 eoff = off + len; 4252 oldeoff = seg->shm_off + seg->shm_size; 4253 4254 /* 4255 * Policy set already? 4256 */ 4257 if (policy == seg->shm_policy.mem_policy) { 4258 /* 4259 * Nothing left to do if offset and length 4260 * fall within this segment 4261 */ 4262 if (eoff <= oldeoff) { 4263 retval = 1; 4264 break; 4265 } else { 4266 len = eoff - oldeoff; 4267 off = oldeoff; 4268 continue; 4269 } 4270 } 4271 4272 /* 4273 * Specified offset and length match existing segment exactly 4274 */ 4275 if (off == seg->shm_off && len == seg->shm_size) { 4276 /* 4277 * Set policy and update current length 4278 */ 4279 seg->shm_policy.mem_policy = policy; 4280 seg->shm_policy.mem_lgrpid = LGRP_NONE; 4281 len = 0; 4282 4283 /* 4284 * Try concatenating new segment with previous and next 4285 * segments, since they might have the same policy now. 4286 * Grab previous and next segments first because they 4287 * will change on concatenation. 4288 */ 4289 prev = AVL_PREV(tree, seg); 4290 next = AVL_NEXT(tree, seg); 4291 (void) lgrp_shm_policy_concat(tree, seg, next); 4292 (void) lgrp_shm_policy_concat(tree, prev, seg); 4293 } else { 4294 /* 4295 * Specified offset and length only apply to part of 4296 * existing segment 4297 */ 4298 4299 /* 4300 * New segment starts in middle of old one, so split 4301 * new one off near beginning of old one 4302 */ 4303 newseg = NULL; 4304 if (off > seg->shm_off) { 4305 newseg = lgrp_shm_policy_split(tree, seg, off); 4306 4307 /* 4308 * New segment ends where old one did, so try 4309 * to concatenate with next segment 4310 */ 4311 if (eoff == oldeoff) { 4312 newseg->shm_policy.mem_policy = policy; 4313 newseg->shm_policy.mem_lgrpid = 4314 LGRP_NONE; 4315 (void) lgrp_shm_policy_concat(tree, 4316 newseg, AVL_NEXT(tree, newseg)); 4317 break; 4318 } 4319 } 4320 4321 /* 4322 * New segment ends before old one, so split off end of 4323 * old one 4324 */ 4325 if (eoff < oldeoff) { 4326 if (newseg) { 4327 (void) lgrp_shm_policy_split(tree, 4328 newseg, eoff); 4329 newseg->shm_policy.mem_policy = policy; 4330 newseg->shm_policy.mem_lgrpid = 4331 LGRP_NONE; 4332 } else { 4333 (void) lgrp_shm_policy_split(tree, seg, 4334 eoff); 4335 seg->shm_policy.mem_policy = policy; 4336 seg->shm_policy.mem_lgrpid = LGRP_NONE; 4337 } 4338 4339 if (off == seg->shm_off) 4340 (void) lgrp_shm_policy_concat(tree, 4341 AVL_PREV(tree, seg), seg); 4342 break; 4343 } 4344 4345 /* 4346 * Calculate remaining length and next offset 4347 */ 4348 len = eoff - oldeoff; 4349 off = oldeoff; 4350 } 4351 } 4352 4353 rw_exit(&shm_locality->loc_lock); 4354 return (retval); 4355 } 4356 4357 /* 4358 * Return the best memnode from which to allocate memory given 4359 * an lgroup. 4360 * 4361 * "c" is for cookie, which is good enough for me. 4362 * It references a cookie struct that should be zero'ed to initialize. 4363 * The cookie should live on the caller's stack. 4364 * 4365 * The routine returns -1 when: 4366 * - traverse is 0, and all the memnodes in "lgrp" have been returned. 4367 * - traverse is 1, and all the memnodes in the system have been 4368 * returned. 4369 */ 4370 int 4371 lgrp_memnode_choose(lgrp_mnode_cookie_t *c) 4372 { 4373 lgrp_t *lp = c->lmc_lgrp; 4374 mnodeset_t nodes = c->lmc_nodes; 4375 int cnt = c->lmc_cnt; 4376 int offset, mnode; 4377 4378 extern int max_mem_nodes; 4379 4380 /* 4381 * If the set is empty, and the caller is willing, traverse 4382 * up the hierarchy until we find a non-empty set. 4383 */ 4384 while (nodes == (mnodeset_t)0 || cnt <= 0) { 4385 if (c->lmc_scope == LGRP_SRCH_LOCAL || 4386 ((lp = lp->lgrp_parent) == NULL)) 4387 return (-1); 4388 4389 nodes = lp->lgrp_mnodes & ~(c->lmc_tried); 4390 cnt = lp->lgrp_nmnodes - c->lmc_ntried; 4391 } 4392 4393 /* 4394 * Select a memnode by picking one at a "random" offset. 4395 * Because of DR, memnodes can come and go at any time. 4396 * This code must be able to cope with the possibility 4397 * that the nodes count "cnt" is inconsistent with respect 4398 * to the number of elements actually in "nodes", and 4399 * therefore that the offset chosen could be greater than 4400 * the number of elements in the set (some memnodes may 4401 * have dissapeared just before cnt was read). 4402 * If this happens, the search simply wraps back to the 4403 * beginning of the set. 4404 */ 4405 ASSERT(nodes != (mnodeset_t)0 && cnt > 0); 4406 offset = c->lmc_rand % cnt; 4407 do { 4408 for (mnode = 0; mnode < max_mem_nodes; mnode++) 4409 if (nodes & ((mnodeset_t)1 << mnode)) 4410 if (!offset--) 4411 break; 4412 } while (mnode >= max_mem_nodes); 4413 4414 /* Found a node. Store state before returning. */ 4415 c->lmc_lgrp = lp; 4416 c->lmc_nodes = (nodes & ~((mnodeset_t)1 << mnode)); 4417 c->lmc_cnt = cnt - 1; 4418 c->lmc_tried = (c->lmc_tried | ((mnodeset_t)1 << mnode)); 4419 c->lmc_ntried++; 4420 4421 return (mnode); 4422 } 4423