/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M% %I% %E% SMI"

/*
 * Basic NUMA support in terms of locality groups
 *
 * Solaris needs to know which CPUs, memory, etc. are near each other to
 * provide good performance on NUMA machines by optimizing for locality.
 * In order to do this, a new abstraction called a "locality group (lgroup)"
 * has been introduced to keep track of which CPU-like and memory-like hardware
 * resources are close to each other.  Currently, latency is the only measure
 * used to determine how to group hardware resources into lgroups, but this
 * does not limit the groupings to be based solely on latency.  Other factors
 * may be used to determine the groupings in the future.
 *
 * Lgroups are organized into a hierarchy or topology that represents the
 * latency topology of the machine.  There is always at least a root lgroup in
 * the system.  It represents all the hardware resources in the machine at a
 * latency big enough that any hardware resource can at least access any other
 * hardware resource within that latency.  A Uniform Memory Access (UMA)
 * machine is represented with one lgroup (the root).  In contrast, a NUMA
 * machine is represented at least by the root lgroup and some number of leaf
 * lgroups where the leaf lgroups contain the hardware resources within the
 * least latency of each other and the root lgroup still contains all the
 * resources in the machine.  Some number of intermediate lgroups may exist
 * which represent more levels of locality than just the local latency of the
 * leaf lgroups and the system latency of the root lgroup.  Non-leaf lgroups
 * (eg. root and intermediate lgroups) contain the next nearest resources to
 * their child lgroups.  Thus, the lgroup hierarchy from a given leaf lgroup
 * to the root lgroup shows the hardware resources from closest to farthest
 * from the leaf lgroup such that each successive ancestor lgroup contains
 * the next nearest resources at the next level of locality from the previous.
 *
 * The kernel uses the lgroup abstraction to know how to allocate resources
 * near a given process/thread.  At fork() and lwp/thread_create() time, a
 * "home" lgroup is chosen for a thread.  This is done by picking the lgroup
 * with the lowest load average.  Binding to a processor or processor set will
 * change the home lgroup for a thread.  The scheduler has been modified to try
 * to dispatch a thread on a CPU in its home lgroup.
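 *
 * As a minimal illustration (a sketch only, not a statement of any formal
 * interface), a kernel consumer can look up the current thread's home lgroup
 * with lgrp_home_lgrp(), defined later in this file, bracketing the call
 * with kpreempt_disable()/kpreempt_enable() as the comments for that routine
 * advise:
 *
 *	lgrp_t	*home;
 *
 *	kpreempt_disable();
 *	home = lgrp_home_lgrp();	(home lgroup of curthread)
 *	kpreempt_enable();
 *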
Physical memory 64 * allocation is lgroup aware too, so memory will be allocated from the current 65 * thread's home lgroup if possible. If the desired resources are not 66 * available, the kernel traverses the lgroup hierarchy going to the parent 67 * lgroup to find resources at the next level of locality until it reaches the 68 * root lgroup. 69 */ 70 71 #include <sys/lgrp.h> 72 #include <sys/lgrp_user.h> 73 #include <sys/types.h> 74 #include <sys/mman.h> 75 #include <sys/param.h> 76 #include <sys/var.h> 77 #include <sys/thread.h> 78 #include <sys/cpuvar.h> 79 #include <sys/cpupart.h> 80 #include <sys/kmem.h> 81 #include <vm/seg.h> 82 #include <vm/seg_kmem.h> 83 #include <vm/seg_spt.h> 84 #include <vm/seg_vn.h> 85 #include <vm/as.h> 86 #include <sys/atomic.h> 87 #include <sys/systm.h> 88 #include <sys/errno.h> 89 #include <sys/cmn_err.h> 90 #include <sys/kstat.h> 91 #include <sys/sysmacros.h> 92 #include <sys/chip.h> 93 #include <sys/promif.h> 94 #include <sys/sdt.h> 95 96 lgrp_gen_t lgrp_gen = 0; /* generation of lgroup hierarchy */ 97 lgrp_t *lgrp_table[NLGRPS_MAX]; /* table of all initialized lgrp_t structs */ 98 /* indexed by lgrp_id */ 99 int nlgrps; /* number of lgroups in machine */ 100 int lgrp_alloc_hint = -1; /* hint for where to try to allocate next */ 101 int lgrp_alloc_max = 0; /* max lgroup ID allocated so far */ 102 103 /* 104 * Kstat data for lgroups. 105 * 106 * Actual kstat data is collected in lgrp_stats array. 107 * The lgrp_kstat_data array of named kstats is used to extract data from 108 * lgrp_stats and present it to kstat framework. It is protected from partallel 109 * modifications by lgrp_kstat_mutex. This may cause some contention when 110 * several kstat commands run in parallel but this is not the 111 * performance-critical path. 112 */ 113 extern struct lgrp_stats lgrp_stats[]; /* table of per-lgrp stats */ 114 115 /* 116 * Declare kstat names statically for enums as defined in the header file. 117 */ 118 LGRP_KSTAT_NAMES; 119 120 static void lgrp_kstat_init(void); 121 static int lgrp_kstat_extract(kstat_t *, int); 122 static void lgrp_kstat_reset(lgrp_id_t); 123 124 static struct kstat_named lgrp_kstat_data[LGRP_NUM_STATS]; 125 static kmutex_t lgrp_kstat_mutex; 126 127 128 /* 129 * max number of lgroups supported by the platform 130 */ 131 int nlgrpsmax = 0; 132 133 /* 134 * The root lgroup. Represents the set of resources at the system wide 135 * level of locality. 136 */ 137 lgrp_t *lgrp_root = NULL; 138 139 /* 140 * During system bootstrap cp_default does not contain the list of lgrp load 141 * averages (cp_lgrploads). The list is allocated after the first CPU is brought 142 * on-line when cp_default is initialized by cpupart_initialize_default(). 143 * Configuring CPU0 may create a two-level topology with root and one leaf node 144 * containing CPU0. This topology is initially constructed in a special 145 * statically allocated 2-element lpl list lpl_bootstrap_list and later cloned 146 * to cp_default when cp_default is initialized. The lpl_bootstrap_list is used 147 * for all lpl operations until cp_default is fully constructed. 148 * 149 * The lpl_bootstrap_list is maintained by the code in lgrp.c. Every other 150 * consumer who needs default lpl should use lpl_bootstrap which is a pointer to 151 * the first element of lpl_bootstrap_list. 152 * 153 * CPUs that are added to the system, but have not yet been assigned to an 154 * lgrp will use lpl_bootstrap as a default lpl. 
This is necessary because 155 * on some architectures (x86) it's possible for the slave CPU startup thread 156 * to enter the dispatcher or allocate memory before calling lgrp_cpu_init(). 157 */ 158 #define LPL_BOOTSTRAP_SIZE 2 159 static lpl_t lpl_bootstrap_list[LPL_BOOTSTRAP_SIZE]; 160 lpl_t *lpl_bootstrap; 161 162 /* 163 * If cp still references the bootstrap lpl, it has not yet been added to 164 * an lgrp. lgrp_mem_choose() uses this macro to detect the case where 165 * a thread is trying to allocate memory close to a CPU that has no lgrp. 166 */ 167 #define LGRP_CPU_HAS_NO_LGRP(cp) ((cp)->cpu_lpl == lpl_bootstrap) 168 169 static lgrp_t lroot; 170 171 172 /* 173 * Size, in bytes, beyond which random memory allocation policy is applied 174 * to non-shared memory. Default is the maximum size, so random memory 175 * allocation won't be used for non-shared memory by default. 176 */ 177 size_t lgrp_privm_random_thresh = (size_t)(-1); 178 179 /* 180 * Size, in bytes, beyond which random memory allocation policy is applied to 181 * shared memory. Default is 8MB (2 ISM pages). 182 */ 183 size_t lgrp_shm_random_thresh = 8*1024*1024; 184 185 /* 186 * Whether to do processor set aware memory allocation by default 187 */ 188 int lgrp_mem_pset_aware = 0; 189 190 /* 191 * Set the default memory allocation policy for root lgroup 192 */ 193 lgrp_mem_policy_t lgrp_mem_policy_root = LGRP_MEM_POLICY_RANDOM; 194 195 /* 196 * Set the default memory allocation policy. For most platforms, 197 * next touch is sufficient, but some platforms may wish to override 198 * this. 199 */ 200 lgrp_mem_policy_t lgrp_mem_default_policy = LGRP_MEM_POLICY_NEXT; 201 202 203 /* 204 * lgroup CPU event handlers 205 */ 206 static void lgrp_cpu_init(struct cpu *); 207 static void lgrp_cpu_fini(struct cpu *, lgrp_id_t); 208 static lgrp_t *lgrp_cpu_to_lgrp(struct cpu *); 209 210 static void lgrp_latency_change(u_longlong_t, u_longlong_t); 211 212 /* 213 * lgroup memory event handlers 214 */ 215 static void lgrp_mem_init(int, lgrp_handle_t, boolean_t); 216 static void lgrp_mem_fini(int, lgrp_handle_t, boolean_t); 217 static void lgrp_mem_rename(int, lgrp_handle_t, lgrp_handle_t); 218 219 /* 220 * lgroup CPU partition event handlers 221 */ 222 static void lgrp_part_add_cpu(struct cpu *, lgrp_id_t); 223 static void lgrp_part_del_cpu(struct cpu *); 224 225 static void lgrp_root_init(void); 226 227 /* 228 * lpl topology 229 */ 230 static void lpl_init(lpl_t *, lpl_t *, lgrp_t *); 231 static void lpl_clear(lpl_t *); 232 static void lpl_leaf_insert(lpl_t *, struct cpupart *); 233 static void lpl_leaf_remove(lpl_t *, struct cpupart *); 234 static void lpl_rset_add(lpl_t *, lpl_t *); 235 static void lpl_rset_del(lpl_t *, lpl_t *); 236 static int lpl_rset_contains(lpl_t *, lpl_t *); 237 static void lpl_cpu_adjcnt(lpl_act_t, struct cpu *); 238 static void lpl_child_update(lpl_t *, struct cpupart *); 239 static int lpl_pick(lpl_t *, lpl_t *); 240 static void lpl_verify_wrapper(struct cpupart *); 241 242 /* 243 * defines for lpl topology verifier return codes 244 */ 245 246 #define LPL_TOPO_CORRECT 0 247 #define LPL_TOPO_PART_HAS_NO_LPL -1 248 #define LPL_TOPO_CPUS_NOT_EMPTY -2 249 #define LPL_TOPO_LGRP_MISMATCH -3 250 #define LPL_TOPO_MISSING_PARENT -4 251 #define LPL_TOPO_PARENT_MISMATCH -5 252 #define LPL_TOPO_BAD_CPUCNT -6 253 #define LPL_TOPO_RSET_MISMATCH -7 254 #define LPL_TOPO_LPL_ORPHANED -8 255 #define LPL_TOPO_LPL_BAD_NCPU -9 256 #define LPL_TOPO_RSET_MSSNG_LF -10 257 #define LPL_TOPO_CPU_HAS_BAD_LPL -11 258 #define 
LPL_TOPO_BOGUS_HINT -12 259 #define LPL_TOPO_NONLEAF_HAS_CPUS -13 260 #define LPL_TOPO_LGRP_NOT_LEAF -14 261 #define LPL_TOPO_BAD_RSETCNT -15 262 263 /* 264 * Return whether lgroup optimizations should be enabled on this system 265 */ 266 int 267 lgrp_optimizations(void) 268 { 269 /* 270 * System must have more than 2 lgroups to enable lgroup optimizations 271 * 272 * XXX This assumes that a 2 lgroup system has an empty root lgroup 273 * with one child lgroup containing all the resources. A 2 lgroup 274 * system with a root lgroup directly containing CPUs or memory might 275 * need lgroup optimizations with its child lgroup, but there 276 * isn't such a machine for now.... 277 */ 278 if (nlgrps > 2) 279 return (1); 280 281 return (0); 282 } 283 284 /* 285 * Build full lgroup topology 286 */ 287 static void 288 lgrp_root_init(void) 289 { 290 lgrp_handle_t hand; 291 int i; 292 lgrp_id_t id; 293 294 /* 295 * Create the "root" lgroup 296 */ 297 ASSERT(nlgrps == 0); 298 id = nlgrps++; 299 300 lgrp_root = &lroot; 301 302 lgrp_root->lgrp_cpu = NULL; 303 lgrp_root->lgrp_mnodes = 0; 304 lgrp_root->lgrp_nmnodes = 0; 305 hand = lgrp_plat_root_hand(); 306 lgrp_root->lgrp_plathand = hand; 307 308 lgrp_root->lgrp_id = id; 309 lgrp_root->lgrp_cpucnt = 0; 310 lgrp_root->lgrp_childcnt = 0; 311 klgrpset_clear(lgrp_root->lgrp_children); 312 klgrpset_clear(lgrp_root->lgrp_leaves); 313 lgrp_root->lgrp_parent = NULL; 314 lgrp_root->lgrp_chips = NULL; 315 lgrp_root->lgrp_chipcnt = 0; 316 lgrp_root->lgrp_latency = lgrp_plat_latency(hand, hand); 317 318 for (i = 0; i < LGRP_RSRC_COUNT; i++) 319 klgrpset_clear(lgrp_root->lgrp_set[i]); 320 321 lgrp_root->lgrp_kstat = NULL; 322 323 lgrp_table[id] = lgrp_root; 324 325 /* 326 * Setup initial lpl list for CPU0 and initial t0 home. 327 * The only lpl space we have so far is lpl_bootstrap. It is used for 328 * all topology operations until cp_default is initialized at which 329 * point t0.t_lpl will be updated. 330 */ 331 lpl_bootstrap = lpl_bootstrap_list; 332 t0.t_lpl = lpl_bootstrap; 333 cp_default.cp_nlgrploads = LPL_BOOTSTRAP_SIZE; 334 lpl_bootstrap_list[1].lpl_lgrpid = 1; 335 cp_default.cp_lgrploads = lpl_bootstrap; 336 } 337 338 /* 339 * Initialize the lgroup framework and allow the platform to do the same 340 */ 341 void 342 lgrp_init(void) 343 { 344 /* 345 * Initialize the platform 346 */ 347 lgrp_plat_init(); 348 349 /* 350 * Set max number of lgroups supported on this platform which must be 351 * less than the max number of lgroups supported by the common lgroup 352 * framework (eg. NLGRPS_MAX is max elements in lgrp_table[], etc.) 353 */ 354 nlgrpsmax = lgrp_plat_max_lgrps(); 355 ASSERT(nlgrpsmax <= NLGRPS_MAX); 356 } 357 358 /* 359 * Create the root and cpu0's lgroup, and set t0's home. 360 */ 361 void 362 lgrp_setup(void) 363 { 364 /* 365 * Setup the root lgroup 366 */ 367 lgrp_root_init(); 368 369 /* 370 * Add cpu0 to an lgroup 371 */ 372 lgrp_config(LGRP_CONFIG_CPU_ADD, (uintptr_t)CPU, 0); 373 lgrp_config(LGRP_CONFIG_CPU_ONLINE, (uintptr_t)CPU, 0); 374 } 375 376 /* 377 * Lgroup initialization is split in two parts. The first part 378 * (lgrp_main_init()) is called right before start_other_cpus() in main. The 379 * second part (lgrp_main_mp_init()) is called right after start_other_cpus() 380 * when all CPUs are brought online and all distance information is available. 381 * 382 * When lgrp_main_init() is complete it sets lgrp_initialized. The 383 * lgrp_main_mp_init() sets lgrp_topo_initialized. 
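 *
 * Putting the pieces named in this file together, the approximate boot-time
 * ordering is (a sketch; exact call sites are platform and startup specific):
 *
 *	lgrp_init()		- common framework + lgrp_plat_init()
 *	lgrp_setup()		- root lgroup created, cpu0 added, t0 homed
 *	lgrp_main_init()	- after startup() and /etc/system; sets
 *				  lgrp_initialized
 *	start_other_cpus()
 *	lgrp_main_mp_init()	- distances known; sets lgrp_topo_initialized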
 */

/*
 * True when lgrp initialization has been completed.
 */
int	lgrp_initialized = 0;

/*
 * True when lgrp topology is constructed.
 */
int	lgrp_topo_initialized = 0;

/*
 * Init routine called after startup(), /etc/system has been processed,
 * and cpu0 has been added to an lgroup.
 */
void
lgrp_main_init(void)
{
	cpu_t		*cp = CPU;
	lgrp_id_t	lgrpid;
	int		i;
	/*
	 * Enforce a valid lgrp_mem_default_policy
	 */
	if ((lgrp_mem_default_policy <= LGRP_MEM_POLICY_DEFAULT) ||
	    (lgrp_mem_default_policy >= LGRP_NUM_MEM_POLICIES))
		lgrp_mem_default_policy = LGRP_MEM_POLICY_NEXT;

	/*
	 * See if mpo should be disabled.
	 * This may happen in the case of null proc LPA on Starcat.
	 * The platform won't be able to detect null proc LPA until after
	 * cpu0 and memory have already been added to lgroups.
	 * When and if it is detected, the Starcat platform will return
	 * a different platform handle for cpu0 which is what we check for
	 * here.  If mpo should be disabled, move cpu0 to its rightful place
	 * (the root), and destroy the remaining lgroups.  This effectively
	 * provides a UMA lgroup topology.
	 */
	lgrpid = cp->cpu_lpl->lpl_lgrpid;
	if (lgrp_table[lgrpid]->lgrp_plathand !=
	    lgrp_plat_cpu_to_hand(cp->cpu_id)) {
		lgrp_part_del_cpu(cp);
		lgrp_cpu_fini(cp, lgrpid);

		lgrp_cpu_init(cp);
		lgrp_part_add_cpu(cp, cp->cpu_lpl->lpl_lgrpid);

		ASSERT(cp->cpu_lpl->lpl_lgrpid == LGRP_ROOTID);

		/*
		 * Destroy all lgroups except for root
		 */
		for (i = 0; i <= lgrp_alloc_max; i++) {
			if (LGRP_EXISTS(lgrp_table[i]) &&
			    lgrp_table[i] != lgrp_root)
				lgrp_destroy(lgrp_table[i]);
		}

		/*
		 * Fix up root to point at itself for leaves and resources
		 * and not have any children
		 */
		lgrp_root->lgrp_childcnt = 0;
		klgrpset_clear(lgrp_root->lgrp_children);
		klgrpset_clear(lgrp_root->lgrp_leaves);
		klgrpset_add(lgrp_root->lgrp_leaves, LGRP_ROOTID);
		klgrpset_clear(lgrp_root->lgrp_set[LGRP_RSRC_MEM]);
		klgrpset_add(lgrp_root->lgrp_set[LGRP_RSRC_MEM], LGRP_ROOTID);
	}

	/*
	 * Initialize kstats framework.
	 */
	lgrp_kstat_init();
	/*
	 * cpu0 is finally where it should be, so create its lgroup's kstats
	 */
	mutex_enter(&cpu_lock);
	lgrp_kstat_create(cp);
	mutex_exit(&cpu_lock);

	lgrp_plat_main_init();
	lgrp_initialized = 1;
}

/*
 * Finish lgrp initialization after all CPUs are brought on-line.
 * This routine is called after start_other_cpus().
 */
void
lgrp_main_mp_init(void)
{
	klgrpset_t changed;

	/*
	 * Update lgroup topology (if necessary)
	 */
	klgrpset_clear(changed);
	(void) lgrp_topo_update(lgrp_table, lgrp_alloc_max + 1, &changed);
	lgrp_topo_initialized = 1;
}

/*
 * Handle lgroup (re)configuration events (eg. addition of CPU, etc.)
 */
void
lgrp_config(lgrp_config_flag_t event, uintptr_t resource, uintptr_t where)
{
	klgrpset_t	changed;
	cpu_t		*cp;
	lgrp_id_t	id;
	int		rc;

	switch (event) {
	/*
	 * The following (re)configuration events are common code
	 * initiated. lgrp_plat_config() is called here to inform the
	 * platform of the reconfiguration event.
504 */ 505 case LGRP_CONFIG_CPU_ADD: 506 cp = (cpu_t *)resource; 507 508 /* 509 * Initialize the new CPU's lgrp related next/prev 510 * links, and give it a bootstrap lpl so that it can 511 * survive should it need to enter the dispatcher. 512 */ 513 cp->cpu_next_lpl = cp; 514 cp->cpu_prev_lpl = cp; 515 cp->cpu_next_lgrp = cp; 516 cp->cpu_prev_lgrp = cp; 517 cp->cpu_lpl = lpl_bootstrap; 518 519 lgrp_plat_config(event, resource); 520 atomic_add_32(&lgrp_gen, 1); 521 522 break; 523 case LGRP_CONFIG_CPU_DEL: 524 lgrp_plat_config(event, resource); 525 atomic_add_32(&lgrp_gen, 1); 526 527 break; 528 case LGRP_CONFIG_CPU_ONLINE: 529 cp = (cpu_t *)resource; 530 lgrp_cpu_init(cp); 531 lgrp_part_add_cpu(cp, cp->cpu_lpl->lpl_lgrpid); 532 rc = lpl_topo_verify(cp->cpu_part); 533 if (rc != LPL_TOPO_CORRECT) { 534 panic("lpl_topo_verify failed: %d", rc); 535 } 536 lgrp_plat_config(event, resource); 537 atomic_add_32(&lgrp_gen, 1); 538 539 break; 540 case LGRP_CONFIG_CPU_OFFLINE: 541 cp = (cpu_t *)resource; 542 id = cp->cpu_lpl->lpl_lgrpid; 543 lgrp_part_del_cpu(cp); 544 lgrp_cpu_fini(cp, id); 545 rc = lpl_topo_verify(cp->cpu_part); 546 if (rc != LPL_TOPO_CORRECT) { 547 panic("lpl_topo_verify failed: %d", rc); 548 } 549 lgrp_plat_config(event, resource); 550 atomic_add_32(&lgrp_gen, 1); 551 552 break; 553 case LGRP_CONFIG_CPUPART_ADD: 554 cp = (cpu_t *)resource; 555 lgrp_part_add_cpu((cpu_t *)resource, (lgrp_id_t)where); 556 rc = lpl_topo_verify(cp->cpu_part); 557 if (rc != LPL_TOPO_CORRECT) { 558 panic("lpl_topo_verify failed: %d", rc); 559 } 560 lgrp_plat_config(event, resource); 561 562 break; 563 case LGRP_CONFIG_CPUPART_DEL: 564 cp = (cpu_t *)resource; 565 lgrp_part_del_cpu((cpu_t *)resource); 566 rc = lpl_topo_verify(cp->cpu_part); 567 if (rc != LPL_TOPO_CORRECT) { 568 panic("lpl_topo_verify failed: %d", rc); 569 } 570 lgrp_plat_config(event, resource); 571 572 break; 573 /* 574 * The following events are initiated by the memnode 575 * subsystem. 576 */ 577 case LGRP_CONFIG_MEM_ADD: 578 lgrp_mem_init((int)resource, where, B_FALSE); 579 atomic_add_32(&lgrp_gen, 1); 580 581 break; 582 case LGRP_CONFIG_MEM_DEL: 583 lgrp_mem_fini((int)resource, where, B_FALSE); 584 atomic_add_32(&lgrp_gen, 1); 585 586 break; 587 case LGRP_CONFIG_MEM_RENAME: { 588 lgrp_config_mem_rename_t *ren_arg = 589 (lgrp_config_mem_rename_t *)where; 590 591 lgrp_mem_rename((int)resource, 592 ren_arg->lmem_rename_from, 593 ren_arg->lmem_rename_to); 594 atomic_add_32(&lgrp_gen, 1); 595 596 break; 597 } 598 case LGRP_CONFIG_GEN_UPDATE: 599 atomic_add_32(&lgrp_gen, 1); 600 601 break; 602 case LGRP_CONFIG_FLATTEN: 603 if (where == 0) 604 lgrp_topo_levels = (int)resource; 605 else 606 (void) lgrp_topo_flatten(resource, 607 lgrp_table, lgrp_alloc_max, &changed); 608 609 break; 610 /* 611 * Initiated by platform latency probing code 612 */ 613 case LGRP_CONFIG_LATENCY_CHANGE: 614 lgrp_latency_change((u_longlong_t)resource, 615 (u_longlong_t)where); 616 617 break; 618 case LGRP_CONFIG_NOP: 619 620 break; 621 default: 622 break; 623 } 624 625 } 626 627 /* 628 * Called to add lgrp info into cpu structure from cpu_add_unit; 629 * do not assume cpu is in cpu[] yet! 630 * 631 * CPUs are brought online with all other CPUs paused so we can't 632 * allocate memory or we could deadlock the system, so we rely on 633 * the platform to statically allocate as much space as we need 634 * for the lgrp structs and stats. 
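 *
 * For reference (an illustrative recap of the switch above, not a new
 * interface): a CPU is introduced with an LGRP_CONFIG_CPU_ADD event, which
 * sets up its lgrp links and points it at lpl_bootstrap so it can survive
 * entering the dispatcher before it has a real lgroup, and is then placed
 * by an LGRP_CONFIG_CPU_ONLINE event, which calls lgrp_cpu_init() and
 * lgrp_part_add_cpu().  lgrp_setup() drives exactly this pair of calls for
 * cpu0:
 *
 *	lgrp_config(LGRP_CONFIG_CPU_ADD, (uintptr_t)CPU, 0);
 *	lgrp_config(LGRP_CONFIG_CPU_ONLINE, (uintptr_t)CPU, 0);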
635 */ 636 static void 637 lgrp_cpu_init(struct cpu *cp) 638 { 639 klgrpset_t changed; 640 int count; 641 lgrp_handle_t hand; 642 int first_cpu; 643 lgrp_t *my_lgrp; 644 lgrp_id_t lgrpid; 645 struct cpu *cptr; 646 struct chip *chp; 647 648 /* 649 * This is the first time through if the resource set 650 * for the root lgroup is empty. After cpu0 has been 651 * initially added to an lgroup, the root's CPU resource 652 * set can never be empty, since the system's last CPU 653 * cannot be offlined. 654 */ 655 if (klgrpset_isempty(lgrp_root->lgrp_set[LGRP_RSRC_CPU])) { 656 /* 657 * First time through. 658 */ 659 first_cpu = 1; 660 } else { 661 /* 662 * If cpu0 needs to move lgroups, we may come 663 * through here again, at which time cpu_lock won't 664 * be held, and lgrp_initialized will be false. 665 */ 666 ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized); 667 ASSERT(cp->cpu_part != NULL); 668 first_cpu = 0; 669 } 670 671 hand = lgrp_plat_cpu_to_hand(cp->cpu_id); 672 my_lgrp = lgrp_hand_to_lgrp(hand); 673 674 if (my_lgrp == NULL) { 675 /* 676 * Create new lgrp and add it to lgroup topology 677 */ 678 my_lgrp = lgrp_create(); 679 my_lgrp->lgrp_plathand = hand; 680 my_lgrp->lgrp_latency = lgrp_plat_latency(hand, hand); 681 lgrpid = my_lgrp->lgrp_id; 682 klgrpset_add(my_lgrp->lgrp_leaves, lgrpid); 683 klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid); 684 685 count = 0; 686 klgrpset_clear(changed); 687 count += lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1, 688 &changed); 689 /* 690 * May have added new intermediate lgroups, so need to add 691 * resources other than CPUs which are added below 692 */ 693 (void) lgrp_mnode_update(changed, NULL); 694 } else if (my_lgrp->lgrp_latency == 0 && lgrp_plat_latency(hand, hand) 695 > 0) { 696 /* 697 * Leaf lgroup was created, but latency wasn't available 698 * then. So, set latency for it and fill in rest of lgroup 699 * topology now that we know how far it is from other leaf 700 * lgroups. 701 */ 702 lgrpid = my_lgrp->lgrp_id; 703 klgrpset_clear(changed); 704 if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_CPU], 705 lgrpid)) 706 klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid); 707 count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1, 708 &changed); 709 710 /* 711 * May have added new intermediate lgroups, so need to add 712 * resources other than CPUs which are added below 713 */ 714 (void) lgrp_mnode_update(changed, NULL); 715 } else if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_CPU], 716 my_lgrp->lgrp_id)) { 717 int i; 718 719 /* 720 * Update existing lgroup and lgroups containing it with CPU 721 * resource 722 */ 723 lgrpid = my_lgrp->lgrp_id; 724 klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid); 725 for (i = 0; i <= lgrp_alloc_max; i++) { 726 lgrp_t *lgrp; 727 728 lgrp = lgrp_table[i]; 729 if (!LGRP_EXISTS(lgrp) || 730 !lgrp_rsets_member(lgrp->lgrp_set, lgrpid)) 731 continue; 732 733 klgrpset_add(lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid); 734 } 735 } 736 737 lgrpid = my_lgrp->lgrp_id; 738 cp->cpu_lpl = &cp->cpu_part->cp_lgrploads[lgrpid]; 739 740 /* 741 * For multi-lgroup systems, need to setup lpl for CPU0 or CPU0 will 742 * end up in lpl for lgroup 0 whether it is supposed to be in there or 743 * not since none of lgroup IDs in the lpl's have been set yet. 
744 */ 745 if (first_cpu && nlgrpsmax > 1 && lgrpid != cp->cpu_lpl->lpl_lgrpid) 746 cp->cpu_lpl->lpl_lgrpid = lgrpid; 747 748 /* 749 * link the CPU into the lgrp's CPU list 750 */ 751 if (my_lgrp->lgrp_cpucnt == 0) { 752 my_lgrp->lgrp_cpu = cp; 753 cp->cpu_next_lgrp = cp->cpu_prev_lgrp = cp; 754 } else { 755 cptr = my_lgrp->lgrp_cpu; 756 cp->cpu_next_lgrp = cptr; 757 cp->cpu_prev_lgrp = cptr->cpu_prev_lgrp; 758 cptr->cpu_prev_lgrp->cpu_next_lgrp = cp; 759 cptr->cpu_prev_lgrp = cp; 760 } 761 my_lgrp->lgrp_cpucnt++; 762 763 /* 764 * Add this cpu's chip to the per lgroup list 765 * if necessary 766 */ 767 if (cp->cpu_chip->chip_lgrp == NULL) { 768 struct chip *lcpr; 769 770 chp = cp->cpu_chip; 771 772 if (my_lgrp->lgrp_chipcnt == 0) { 773 my_lgrp->lgrp_chips = chp; 774 chp->chip_next_lgrp = 775 chp->chip_prev_lgrp = chp; 776 } else { 777 lcpr = my_lgrp->lgrp_chips; 778 chp->chip_next_lgrp = lcpr; 779 chp->chip_prev_lgrp = 780 lcpr->chip_prev_lgrp; 781 lcpr->chip_prev_lgrp->chip_next_lgrp = 782 chp; 783 lcpr->chip_prev_lgrp = chp; 784 } 785 chp->chip_lgrp = my_lgrp; 786 chp->chip_balance = chp->chip_next_lgrp; 787 my_lgrp->lgrp_chipcnt++; 788 } 789 } 790 791 lgrp_t * 792 lgrp_create(void) 793 { 794 lgrp_t *my_lgrp; 795 lgrp_id_t lgrpid; 796 int i; 797 798 ASSERT(!lgrp_initialized || MUTEX_HELD(&cpu_lock)); 799 800 /* 801 * Find an open slot in the lgroup table and recycle unused lgroup 802 * left there if any 803 */ 804 my_lgrp = NULL; 805 if (lgrp_alloc_hint == -1) 806 /* 807 * Allocate from end when hint not set yet because no lgroups 808 * have been deleted yet 809 */ 810 lgrpid = nlgrps++; 811 else { 812 /* 813 * Start looking for next open slot from hint and leave hint 814 * at slot allocated 815 */ 816 for (i = lgrp_alloc_hint; i < nlgrpsmax; i++) { 817 my_lgrp = lgrp_table[i]; 818 if (!LGRP_EXISTS(my_lgrp)) { 819 lgrpid = i; 820 nlgrps++; 821 break; 822 } 823 } 824 lgrp_alloc_hint = lgrpid; 825 } 826 827 /* 828 * Keep track of max lgroup ID allocated so far to cut down on searches 829 */ 830 if (lgrpid > lgrp_alloc_max) 831 lgrp_alloc_max = lgrpid; 832 833 /* 834 * Need to allocate new lgroup if next open slot didn't have one 835 * for recycling 836 */ 837 if (my_lgrp == NULL) 838 my_lgrp = lgrp_plat_alloc(lgrpid); 839 840 if (nlgrps > nlgrpsmax || my_lgrp == NULL) 841 panic("Too many lgrps for platform (%d)", nlgrps); 842 843 my_lgrp->lgrp_id = lgrpid; 844 my_lgrp->lgrp_latency = 0; 845 my_lgrp->lgrp_plathand = LGRP_NULL_HANDLE; 846 my_lgrp->lgrp_parent = NULL; 847 my_lgrp->lgrp_childcnt = 0; 848 my_lgrp->lgrp_mnodes = (mnodeset_t)0; 849 my_lgrp->lgrp_nmnodes = 0; 850 klgrpset_clear(my_lgrp->lgrp_children); 851 klgrpset_clear(my_lgrp->lgrp_leaves); 852 for (i = 0; i < LGRP_RSRC_COUNT; i++) 853 klgrpset_clear(my_lgrp->lgrp_set[i]); 854 855 my_lgrp->lgrp_cpu = NULL; 856 my_lgrp->lgrp_cpucnt = 0; 857 my_lgrp->lgrp_chips = NULL; 858 my_lgrp->lgrp_chipcnt = 0; 859 860 if (my_lgrp->lgrp_kstat != NULL) 861 lgrp_kstat_reset(lgrpid); 862 863 lgrp_table[my_lgrp->lgrp_id] = my_lgrp; 864 865 return (my_lgrp); 866 } 867 868 void 869 lgrp_destroy(lgrp_t *lgrp) 870 { 871 int i; 872 873 /* 874 * Unless this lgroup is being destroyed on behalf of 875 * the boot CPU, cpu_lock must be held 876 */ 877 ASSERT(!lgrp_initialized || MUTEX_HELD(&cpu_lock)); 878 879 if (nlgrps == 1) 880 cmn_err(CE_PANIC, "Can't destroy only lgroup!"); 881 882 if (!LGRP_EXISTS(lgrp)) 883 return; 884 885 /* 886 * Set hint to lgroup being deleted and try to keep lower numbered 887 * hints to facilitate finding empty slots 888 */ 
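	/*
	 * Worked example (illustration only): if lgroups 5 and then 3 are
	 * destroyed, lgrp_alloc_hint ends up at 3, so the next lgrp_create()
	 * begins its scan at slot 3 and recycles it, and a later
	 * lgrp_create() walks past 3 and recycles slot 5.
	 */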
889 if (lgrp_alloc_hint == -1 || lgrp->lgrp_id < lgrp_alloc_hint) 890 lgrp_alloc_hint = lgrp->lgrp_id; 891 892 /* 893 * Mark this lgroup to be recycled by setting its lgroup ID to 894 * LGRP_NONE and clear relevant fields 895 */ 896 lgrp->lgrp_id = LGRP_NONE; 897 lgrp->lgrp_latency = 0; 898 lgrp->lgrp_plathand = LGRP_NULL_HANDLE; 899 lgrp->lgrp_parent = NULL; 900 lgrp->lgrp_childcnt = 0; 901 902 klgrpset_clear(lgrp->lgrp_children); 903 klgrpset_clear(lgrp->lgrp_leaves); 904 for (i = 0; i < LGRP_RSRC_COUNT; i++) 905 klgrpset_clear(lgrp->lgrp_set[i]); 906 907 lgrp->lgrp_mnodes = (mnodeset_t)0; 908 lgrp->lgrp_nmnodes = 0; 909 910 lgrp->lgrp_cpu = NULL; 911 lgrp->lgrp_cpucnt = 0; 912 lgrp->lgrp_chipcnt = 0; 913 lgrp->lgrp_chips = NULL; 914 915 nlgrps--; 916 } 917 918 /* 919 * Initialize kstat data. Called from lgrp intialization code. 920 */ 921 static void 922 lgrp_kstat_init(void) 923 { 924 lgrp_stat_t stat; 925 926 mutex_init(&lgrp_kstat_mutex, NULL, MUTEX_DEFAULT, NULL); 927 928 for (stat = 0; stat < LGRP_NUM_STATS; stat++) 929 kstat_named_init(&lgrp_kstat_data[stat], 930 lgrp_kstat_names[stat], KSTAT_DATA_INT64); 931 } 932 933 /* 934 * initialize an lgrp's kstats if needed 935 * called with cpu_lock held but not with cpus paused. 936 * we don't tear these down now because we don't know about 937 * memory leaving the lgrp yet... 938 */ 939 940 void 941 lgrp_kstat_create(cpu_t *cp) 942 { 943 kstat_t *lgrp_kstat; 944 lgrp_id_t lgrpid; 945 lgrp_t *my_lgrp; 946 947 ASSERT(MUTEX_HELD(&cpu_lock)); 948 949 lgrpid = cp->cpu_lpl->lpl_lgrpid; 950 my_lgrp = lgrp_table[lgrpid]; 951 952 if (my_lgrp->lgrp_kstat != NULL) 953 return; /* already initialized */ 954 955 lgrp_kstat = kstat_create("lgrp", lgrpid, NULL, "misc", 956 KSTAT_TYPE_NAMED, LGRP_NUM_STATS, 957 KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE); 958 959 if (lgrp_kstat != NULL) { 960 lgrp_kstat->ks_lock = &lgrp_kstat_mutex; 961 lgrp_kstat->ks_private = my_lgrp; 962 lgrp_kstat->ks_data = &lgrp_kstat_data; 963 lgrp_kstat->ks_update = lgrp_kstat_extract; 964 my_lgrp->lgrp_kstat = lgrp_kstat; 965 kstat_install(lgrp_kstat); 966 } 967 } 968 969 /* 970 * this will do something when we manage to remove now unused lgrps 971 */ 972 973 /* ARGSUSED */ 974 void 975 lgrp_kstat_destroy(cpu_t *cp) 976 { 977 ASSERT(MUTEX_HELD(&cpu_lock)); 978 } 979 980 /* 981 * Called when a CPU is off-lined. 982 */ 983 static void 984 lgrp_cpu_fini(struct cpu *cp, lgrp_id_t lgrpid) 985 { 986 lgrp_t *my_lgrp; 987 struct cpu *prev; 988 struct cpu *next; 989 chip_t *chp; 990 991 ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized); 992 993 prev = cp->cpu_prev_lgrp; 994 next = cp->cpu_next_lgrp; 995 996 prev->cpu_next_lgrp = next; 997 next->cpu_prev_lgrp = prev; 998 999 /* 1000 * just because I'm paranoid doesn't mean... 1001 */ 1002 1003 cp->cpu_next_lgrp = cp->cpu_prev_lgrp = NULL; 1004 1005 my_lgrp = lgrp_table[lgrpid]; 1006 my_lgrp->lgrp_cpucnt--; 1007 1008 /* 1009 * If the last CPU on it's chip is being offlined 1010 * then remove this chip from the per lgroup list. 1011 * 1012 * This is also done for the boot CPU when it needs 1013 * to move between lgroups as a consequence of 1014 * null proc lpa. 
1015 */ 1016 chp = cp->cpu_chip; 1017 if (chp->chip_ncpu == 0 || !lgrp_initialized) { 1018 1019 chip_t *chpp; 1020 1021 if (--my_lgrp->lgrp_chipcnt == 0) 1022 my_lgrp->lgrp_chips = NULL; 1023 else if (my_lgrp->lgrp_chips == chp) 1024 my_lgrp->lgrp_chips = chp->chip_next_lgrp; 1025 1026 /* 1027 * Walk this lgroup's chip list looking for chips that 1028 * may try to balance against the one that's leaving 1029 */ 1030 for (chpp = chp->chip_next_lgrp; chpp != chp; 1031 chpp = chpp->chip_next_lgrp) { 1032 if (chpp->chip_balance == chp) 1033 chpp->chip_balance = chp->chip_next_lgrp; 1034 } 1035 1036 chp->chip_prev_lgrp->chip_next_lgrp = chp->chip_next_lgrp; 1037 chp->chip_next_lgrp->chip_prev_lgrp = chp->chip_prev_lgrp; 1038 1039 chp->chip_next_lgrp = chp->chip_prev_lgrp = NULL; 1040 chp->chip_lgrp = NULL; 1041 chp->chip_balance = NULL; 1042 } 1043 1044 /* 1045 * Removing last CPU in lgroup, so update lgroup topology 1046 */ 1047 if (my_lgrp->lgrp_cpucnt == 0) { 1048 klgrpset_t changed; 1049 int count; 1050 int i; 1051 1052 my_lgrp->lgrp_cpu = NULL; 1053 1054 /* 1055 * Remove this lgroup from its lgroup CPU resources and remove 1056 * lgroup from lgroup topology if it doesn't have any more 1057 * resources in it now 1058 */ 1059 klgrpset_del(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid); 1060 if (lgrp_rsets_empty(my_lgrp->lgrp_set)) { 1061 count = 0; 1062 klgrpset_clear(changed); 1063 count += lgrp_leaf_delete(my_lgrp, lgrp_table, 1064 lgrp_alloc_max + 1, &changed); 1065 return; 1066 } 1067 1068 /* 1069 * This lgroup isn't empty, so just remove it from CPU 1070 * resources of any lgroups that contain it as such 1071 */ 1072 for (i = 0; i <= lgrp_alloc_max; i++) { 1073 lgrp_t *lgrp; 1074 1075 lgrp = lgrp_table[i]; 1076 if (!LGRP_EXISTS(lgrp) || 1077 !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_CPU], 1078 lgrpid)) 1079 continue; 1080 1081 klgrpset_del(lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid); 1082 } 1083 return; 1084 } 1085 1086 if (my_lgrp->lgrp_cpu == cp) 1087 my_lgrp->lgrp_cpu = next; 1088 1089 } 1090 1091 /* 1092 * Update memory nodes in target lgroups and return ones that get changed 1093 */ 1094 int 1095 lgrp_mnode_update(klgrpset_t target, klgrpset_t *changed) 1096 { 1097 int count; 1098 int i; 1099 int j; 1100 lgrp_t *lgrp; 1101 lgrp_t *lgrp_rsrc; 1102 1103 count = 0; 1104 if (changed) 1105 klgrpset_clear(*changed); 1106 1107 if (klgrpset_isempty(target)) 1108 return (0); 1109 1110 /* 1111 * Find each lgroup in target lgroups 1112 */ 1113 for (i = 0; i <= lgrp_alloc_max; i++) { 1114 /* 1115 * Skip any lgroups that don't exist or aren't in target group 1116 */ 1117 lgrp = lgrp_table[i]; 1118 if (!klgrpset_ismember(target, i) || !LGRP_EXISTS(lgrp)) { 1119 continue; 1120 } 1121 1122 /* 1123 * Initialize memnodes for intermediate lgroups to 0 1124 * and update them from scratch since they may have completely 1125 * changed 1126 */ 1127 if (lgrp->lgrp_childcnt && lgrp != lgrp_root) { 1128 lgrp->lgrp_mnodes = (mnodeset_t)0; 1129 lgrp->lgrp_nmnodes = 0; 1130 } 1131 1132 /* 1133 * Update memory nodes of of target lgroup with memory nodes 1134 * from each lgroup in its lgroup memory resource set 1135 */ 1136 for (j = 0; j <= lgrp_alloc_max; j++) { 1137 int k; 1138 1139 /* 1140 * Skip any lgroups that don't exist or aren't in 1141 * memory resources of target lgroup 1142 */ 1143 lgrp_rsrc = lgrp_table[j]; 1144 if (!LGRP_EXISTS(lgrp_rsrc) || 1145 !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM], 1146 j)) 1147 continue; 1148 1149 /* 1150 * Update target lgroup's memnodes to include memnodes 1151 * of this 
 * lgroup
 */
			for (k = 0; k < sizeof (mnodeset_t) * NBBY; k++) {
				mnodeset_t	mnode_mask;

				mnode_mask = (mnodeset_t)1 << k;
				if ((lgrp_rsrc->lgrp_mnodes & mnode_mask) &&
				    !(lgrp->lgrp_mnodes & mnode_mask)) {
					lgrp->lgrp_mnodes |= mnode_mask;
					lgrp->lgrp_nmnodes++;
				}
			}
			count++;
			if (changed)
				klgrpset_add(*changed, lgrp->lgrp_id);
		}
	}

	return (count);
}

/*
 * Memory copy-rename.  Called when the "mnode" containing the kernel cage
 * memory is moved from one board to another.  The "from" and "to" arguments
 * specify the source and the destination of the move.
 *
 * See plat_lgrp_config() for a detailed description of the copy-rename
 * semantics.
 *
 * lgrp_mem_rename() is called by the platform copy-rename code to update
 * the lgroup topology which is changing as memory moves from one lgroup to
 * another.  It removes the mnode from the source lgroup and re-inserts it in
 * the target lgroup.
 *
 * lgrp_mem_rename() passes a flag to lgrp_mem_init() and lgrp_mem_fini()
 * indicating that the insertion and deletion are part of a DR copy-rename
 * operation.
 *
 * There is one case which requires special handling.  If the system contains
 * only two boards (mnodes), lgrp_mem_fini() removes the only mnode from the
 * lgroup hierarchy.  This mnode is soon re-inserted back in the hierarchy by
 * lgrp_mem_init(), but there is a window when the system has no memory in the
 * lgroup hierarchy.  If another thread tries to allocate memory during this
 * window, the allocation will fail, although the system has physical memory.
 * This may cause a system panic or a deadlock (some sleeping memory
 * allocations happen with cpu_lock held which prevents lgrp_mem_init() from
 * re-inserting the mnode back).
 *
 * The lgrp_memnode_choose() function walks the lgroup hierarchy looking for
 * the lgrp with non-empty lgrp_mnodes.  To deal with the special case above,
 * lgrp_mem_fini() does not remove the last mnode from lroot->lgrp_mnodes,
 * but it updates the rest of the lgroup topology as if the mnode was actually
 * removed.  The lgrp_mem_init() function recognizes that the mnode being
 * inserted represents such a special case and updates the topology
 * appropriately.
 */
void
lgrp_mem_rename(int mnode, lgrp_handle_t from, lgrp_handle_t to)
{
	/*
	 * Remove the memory from the source node and add it to the
	 * destination node.
	 */
	lgrp_mem_fini(mnode, from, B_TRUE);
	lgrp_mem_init(mnode, to, B_TRUE);
}

/*
 * Called to indicate that the lgrp with platform handle "hand" now
 * contains the memory identified by "mnode".
 *
 * LOCKING for this routine is a bit tricky.  Usually it is called without
 * cpu_lock and it must grab cpu_lock here to prevent racing with other
 * callers.  During DR of the board containing the caged memory it may be
 * called with cpu_lock already held and CPUs paused.
 *
 * If the insertion is part of the DR copy-rename and the inserted mnode (and
 * only this mnode) is already present in the lgrp_root->lgrp_mnodes set, we
 * are dealing with the special case of DR copy-rename described in
 * lgrp_mem_rename().
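 *
 * Concretely (illustrative numbers only): for mnode 2, the mnodes_mask
 * computed below is (mnodeset_t)1 << 2, i.e. 0x4.  If is_copy_rename is set
 * and lgrp_root->lgrp_mnodes is exactly 0x4, the mnode is "present" only
 * because lgrp_mem_fini() deliberately left it in the root, so this routine
 * recognizes the situation and continues rather than returning early (see
 * the comment ahead of the check below).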
1231 */ 1232 void 1233 lgrp_mem_init(int mnode, lgrp_handle_t hand, boolean_t is_copy_rename) 1234 { 1235 klgrpset_t changed; 1236 int count; 1237 int i; 1238 lgrp_t *my_lgrp; 1239 lgrp_id_t lgrpid; 1240 mnodeset_t mnodes_mask = ((mnodeset_t)1 << mnode); 1241 boolean_t drop_lock = B_FALSE; 1242 boolean_t need_synch = B_FALSE; 1243 1244 /* 1245 * Grab CPU lock (if we haven't already) 1246 */ 1247 if (!MUTEX_HELD(&cpu_lock)) { 1248 mutex_enter(&cpu_lock); 1249 drop_lock = B_TRUE; 1250 } 1251 1252 /* 1253 * This routine may be called from a context where we already 1254 * hold cpu_lock, and have already paused cpus. 1255 */ 1256 if (!cpus_paused()) 1257 need_synch = B_TRUE; 1258 1259 /* 1260 * Check if this mnode is already configured and return immediately if 1261 * it is. 1262 * 1263 * NOTE: in special case of copy-rename of the only remaining mnode, 1264 * lgrp_mem_fini() refuses to remove the last mnode from the root, so we 1265 * recognize this case and continue as usual, but skip the update to 1266 * the lgrp_mnodes and the lgrp_nmnodes. This restores the inconsistency 1267 * in topology, temporarily introduced by lgrp_mem_fini(). 1268 */ 1269 if (! (is_copy_rename && (lgrp_root->lgrp_mnodes == mnodes_mask)) && 1270 lgrp_root->lgrp_mnodes & mnodes_mask) { 1271 if (drop_lock) 1272 mutex_exit(&cpu_lock); 1273 return; 1274 } 1275 1276 /* 1277 * Update lgroup topology with new memory resources, keeping track of 1278 * which lgroups change 1279 */ 1280 count = 0; 1281 klgrpset_clear(changed); 1282 my_lgrp = lgrp_hand_to_lgrp(hand); 1283 if (my_lgrp == NULL) { 1284 /* new lgrp */ 1285 my_lgrp = lgrp_create(); 1286 lgrpid = my_lgrp->lgrp_id; 1287 my_lgrp->lgrp_plathand = hand; 1288 my_lgrp->lgrp_latency = lgrp_plat_latency(hand, hand); 1289 klgrpset_add(my_lgrp->lgrp_leaves, lgrpid); 1290 klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid); 1291 1292 if (need_synch) 1293 pause_cpus(NULL); 1294 count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1, 1295 &changed); 1296 if (need_synch) 1297 start_cpus(); 1298 } else if (my_lgrp->lgrp_latency == 0 && lgrp_plat_latency(hand, hand) 1299 > 0) { 1300 /* 1301 * Leaf lgroup was created, but latency wasn't available 1302 * then. So, set latency for it and fill in rest of lgroup 1303 * topology now that we know how far it is from other leaf 1304 * lgroups. 
1305 */ 1306 klgrpset_clear(changed); 1307 lgrpid = my_lgrp->lgrp_id; 1308 if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_MEM], 1309 lgrpid)) 1310 klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid); 1311 if (need_synch) 1312 pause_cpus(NULL); 1313 count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1, 1314 &changed); 1315 if (need_synch) 1316 start_cpus(); 1317 } else if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_MEM], 1318 my_lgrp->lgrp_id)) { 1319 /* 1320 * Add new lgroup memory resource to existing lgroup 1321 */ 1322 lgrpid = my_lgrp->lgrp_id; 1323 klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid); 1324 klgrpset_add(changed, lgrpid); 1325 count++; 1326 for (i = 0; i <= lgrp_alloc_max; i++) { 1327 lgrp_t *lgrp; 1328 1329 lgrp = lgrp_table[i]; 1330 if (!LGRP_EXISTS(lgrp) || 1331 !lgrp_rsets_member(lgrp->lgrp_set, lgrpid)) 1332 continue; 1333 1334 klgrpset_add(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid); 1335 klgrpset_add(changed, lgrp->lgrp_id); 1336 count++; 1337 } 1338 } 1339 1340 /* 1341 * Add memory node to lgroup and remove lgroup from ones that need 1342 * to be updated 1343 */ 1344 if (!(my_lgrp->lgrp_mnodes & mnodes_mask)) { 1345 my_lgrp->lgrp_mnodes |= mnodes_mask; 1346 my_lgrp->lgrp_nmnodes++; 1347 } 1348 klgrpset_del(changed, lgrpid); 1349 1350 /* 1351 * Update memory node information for all lgroups that changed and 1352 * contain new memory node as a resource 1353 */ 1354 if (count) 1355 (void) lgrp_mnode_update(changed, NULL); 1356 1357 if (drop_lock) 1358 mutex_exit(&cpu_lock); 1359 } 1360 1361 /* 1362 * Called to indicate that the lgroup associated with the platform 1363 * handle "hand" no longer contains given memory node 1364 * 1365 * LOCKING for this routine is a bit tricky. Usually it is called without 1366 * cpu_lock and it must must grab cpu_lock here to prevent racing with other 1367 * callers. During DR of the board containing the caged memory it may be called 1368 * with cpu_lock already held and CPUs paused. 1369 * 1370 * If the deletion is part of the DR copy-rename and the deleted mnode is the 1371 * only one present in the lgrp_root->lgrp_mnodes, all the topology is updated, 1372 * but lgrp_root->lgrp_mnodes is left intact. Later, lgrp_mem_init() will insert 1373 * the same mnode back into the topology. See lgrp_mem_rename() and 1374 * lgrp_mem_init() for additional details. 1375 */ 1376 void 1377 lgrp_mem_fini(int mnode, lgrp_handle_t hand, boolean_t is_copy_rename) 1378 { 1379 klgrpset_t changed; 1380 int count; 1381 int i; 1382 lgrp_t *my_lgrp; 1383 lgrp_id_t lgrpid; 1384 mnodeset_t mnodes_mask; 1385 boolean_t drop_lock = B_FALSE; 1386 boolean_t need_synch = B_FALSE; 1387 1388 /* 1389 * Grab CPU lock (if we haven't already) 1390 */ 1391 if (!MUTEX_HELD(&cpu_lock)) { 1392 mutex_enter(&cpu_lock); 1393 drop_lock = B_TRUE; 1394 } 1395 1396 /* 1397 * This routine may be called from a context where we already 1398 * hold cpu_lock and have already paused cpus. 
1399 */ 1400 if (!cpus_paused()) 1401 need_synch = B_TRUE; 1402 1403 my_lgrp = lgrp_hand_to_lgrp(hand); 1404 1405 /* 1406 * The lgrp *must* be pre-existing 1407 */ 1408 ASSERT(my_lgrp != NULL); 1409 1410 /* 1411 * Delete memory node from lgroups which contain it 1412 */ 1413 mnodes_mask = ((mnodeset_t)1 << mnode); 1414 for (i = 0; i <= lgrp_alloc_max; i++) { 1415 lgrp_t *lgrp = lgrp_table[i]; 1416 /* 1417 * Skip any non-existent lgroups and any lgroups that don't 1418 * contain leaf lgroup of memory as a memory resource 1419 */ 1420 if (!LGRP_EXISTS(lgrp) || 1421 !(lgrp->lgrp_mnodes & mnodes_mask)) 1422 continue; 1423 1424 /* 1425 * Avoid removing the last mnode from the root in the DR 1426 * copy-rename case. See lgrp_mem_rename() for details. 1427 */ 1428 if (is_copy_rename && 1429 (lgrp == lgrp_root) && (lgrp->lgrp_mnodes == mnodes_mask)) 1430 continue; 1431 1432 /* 1433 * Remove memory node from lgroup. 1434 */ 1435 lgrp->lgrp_mnodes &= ~mnodes_mask; 1436 lgrp->lgrp_nmnodes--; 1437 ASSERT(lgrp->lgrp_nmnodes >= 0); 1438 } 1439 ASSERT(lgrp_root->lgrp_nmnodes > 0); 1440 1441 /* 1442 * Don't need to update lgroup topology if this lgroup still has memory. 1443 * 1444 * In the special case of DR copy-rename with the only mnode being 1445 * removed, the lgrp_mnodes for the root is always non-zero, but we 1446 * still need to update the lgroup topology. 1447 */ 1448 if ((my_lgrp->lgrp_nmnodes > 0) && 1449 !(is_copy_rename && 1450 (my_lgrp == lgrp_root) && 1451 (my_lgrp->lgrp_mnodes == mnodes_mask))) { 1452 if (drop_lock) 1453 mutex_exit(&cpu_lock); 1454 return; 1455 } 1456 1457 /* 1458 * This lgroup does not contain any memory now 1459 */ 1460 klgrpset_clear(my_lgrp->lgrp_set[LGRP_RSRC_MEM]); 1461 1462 /* 1463 * Remove this lgroup from lgroup topology if it does not contain any 1464 * resources now 1465 */ 1466 lgrpid = my_lgrp->lgrp_id; 1467 count = 0; 1468 klgrpset_clear(changed); 1469 if (lgrp_rsets_empty(my_lgrp->lgrp_set)) { 1470 /* 1471 * Delete lgroup when no more resources 1472 */ 1473 if (need_synch) 1474 pause_cpus(NULL); 1475 count = lgrp_leaf_delete(my_lgrp, lgrp_table, 1476 lgrp_alloc_max + 1, &changed); 1477 ASSERT(count > 0); 1478 if (need_synch) 1479 start_cpus(); 1480 } else { 1481 /* 1482 * Remove lgroup from memory resources of any lgroups that 1483 * contain it as such 1484 */ 1485 for (i = 0; i <= lgrp_alloc_max; i++) { 1486 lgrp_t *lgrp; 1487 1488 lgrp = lgrp_table[i]; 1489 if (!LGRP_EXISTS(lgrp) || 1490 !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM], 1491 lgrpid)) 1492 continue; 1493 1494 klgrpset_del(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid); 1495 } 1496 } 1497 if (drop_lock) 1498 mutex_exit(&cpu_lock); 1499 } 1500 1501 /* 1502 * Return lgroup with given platform handle 1503 */ 1504 lgrp_t * 1505 lgrp_hand_to_lgrp(lgrp_handle_t hand) 1506 { 1507 int i; 1508 lgrp_t *lgrp; 1509 1510 if (hand == LGRP_NULL_HANDLE) 1511 return (NULL); 1512 1513 for (i = 0; i <= lgrp_alloc_max; i++) { 1514 lgrp = lgrp_table[i]; 1515 if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand) 1516 return (lgrp); 1517 } 1518 return (NULL); 1519 } 1520 1521 /* 1522 * Return the home lgroup of the current thread. 1523 * We must do this with kernel preemption disabled, since we don't want our 1524 * thread to be re-homed while we're poking around with its lpl, and the lpl 1525 * should never be NULL. 1526 * 1527 * NOTE: Can't guarantee that lgroup will be valid once kernel preemption 1528 * is enabled because of DR. 
Callers can use disable kernel preemption 1529 * around this call to guarantee that the lgroup will be valid beyond this 1530 * routine, since kernel preemption can be recursive. 1531 */ 1532 lgrp_t * 1533 lgrp_home_lgrp(void) 1534 { 1535 lgrp_t *lgrp; 1536 lpl_t *lpl; 1537 1538 kpreempt_disable(); 1539 1540 lpl = curthread->t_lpl; 1541 ASSERT(lpl != NULL); 1542 ASSERT(lpl->lpl_lgrpid >= 0 && lpl->lpl_lgrpid <= lgrp_alloc_max); 1543 ASSERT(LGRP_EXISTS(lgrp_table[lpl->lpl_lgrpid])); 1544 lgrp = lgrp_table[lpl->lpl_lgrpid]; 1545 1546 kpreempt_enable(); 1547 1548 return (lgrp); 1549 } 1550 1551 /* 1552 * Return ID of home lgroup for given thread 1553 * (See comments for lgrp_home_lgrp() for special care and handling 1554 * instructions) 1555 */ 1556 lgrp_id_t 1557 lgrp_home_id(kthread_t *t) 1558 { 1559 lgrp_id_t lgrp; 1560 lpl_t *lpl; 1561 1562 ASSERT(t != NULL); 1563 /* 1564 * We'd like to ASSERT(MUTEX_HELD(&ttoproc(t)->p_lock)), but we 1565 * cannot since the HAT layer can call into this routine to 1566 * determine the locality for its data structures in the context 1567 * of a page fault. 1568 */ 1569 1570 kpreempt_disable(); 1571 1572 lpl = t->t_lpl; 1573 ASSERT(lpl != NULL); 1574 ASSERT(lpl->lpl_lgrpid >= 0 && lpl->lpl_lgrpid <= lgrp_alloc_max); 1575 lgrp = lpl->lpl_lgrpid; 1576 1577 kpreempt_enable(); 1578 1579 return (lgrp); 1580 } 1581 1582 /* 1583 * Return lgroup containing the physical memory for the given page frame number 1584 */ 1585 lgrp_t * 1586 lgrp_pfn_to_lgrp(pfn_t pfn) 1587 { 1588 lgrp_handle_t hand; 1589 int i; 1590 lgrp_t *lgrp; 1591 1592 hand = lgrp_plat_pfn_to_hand(pfn); 1593 if (hand != LGRP_NULL_HANDLE) 1594 for (i = 0; i <= lgrp_alloc_max; i++) { 1595 lgrp = lgrp_table[i]; 1596 if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand) 1597 return (lgrp); 1598 } 1599 return (NULL); 1600 } 1601 1602 /* 1603 * Return lgroup containing the physical memory for the given page frame number 1604 */ 1605 lgrp_t * 1606 lgrp_phys_to_lgrp(u_longlong_t physaddr) 1607 { 1608 lgrp_handle_t hand; 1609 int i; 1610 lgrp_t *lgrp; 1611 pfn_t pfn; 1612 1613 pfn = btop(physaddr); 1614 hand = lgrp_plat_pfn_to_hand(pfn); 1615 if (hand != LGRP_NULL_HANDLE) 1616 for (i = 0; i <= lgrp_alloc_max; i++) { 1617 lgrp = lgrp_table[i]; 1618 if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand) 1619 return (lgrp); 1620 } 1621 return (NULL); 1622 } 1623 1624 /* 1625 * Return the leaf lgroup containing the given CPU 1626 * 1627 * The caller needs to take precautions necessary to prevent 1628 * "cpu" from going away across a call to this function. 1629 * hint: kpreempt_disable()/kpreempt_enable() 1630 */ 1631 static lgrp_t * 1632 lgrp_cpu_to_lgrp(cpu_t *cpu) 1633 { 1634 return (cpu->cpu_lpl->lpl_lgrp); 1635 } 1636 1637 /* 1638 * Return the sum of the partition loads in an lgrp divided by 1639 * the number of CPUs in the lgrp. This is our best approximation 1640 * of an 'lgroup load average' for a useful per-lgroup kstat. 
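 *
 * In other words, the value exported through the kstat is simply
 *
 *	loadavg(lgrp) = (sum of cpu_lpl->lpl_loadavg over the CPUs linked
 *	    into the lgroup) / lgrp_cpucnt
 *
 * computed under cpu_lock so the lgroup's CPU list cannot change mid-walk.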
1641 */ 1642 static uint64_t 1643 lgrp_sum_loadavgs(lgrp_t *lgrp) 1644 { 1645 cpu_t *cpu; 1646 int ncpu; 1647 uint64_t loads = 0; 1648 1649 mutex_enter(&cpu_lock); 1650 1651 cpu = lgrp->lgrp_cpu; 1652 ncpu = lgrp->lgrp_cpucnt; 1653 1654 if (cpu == NULL || ncpu == 0) { 1655 mutex_exit(&cpu_lock); 1656 return (0ull); 1657 } 1658 1659 do { 1660 loads += cpu->cpu_lpl->lpl_loadavg; 1661 cpu = cpu->cpu_next_lgrp; 1662 } while (cpu != lgrp->lgrp_cpu); 1663 1664 mutex_exit(&cpu_lock); 1665 1666 return (loads / ncpu); 1667 } 1668 1669 void 1670 lgrp_stat_add(lgrp_id_t lgrpid, lgrp_stat_t stat, int64_t val) 1671 { 1672 struct lgrp_stats *pstats; 1673 1674 /* 1675 * Verify that the caller isn't trying to add to 1676 * a statistic for an lgroup that has gone away 1677 */ 1678 if (lgrpid < 0 || lgrpid > lgrp_alloc_max) 1679 return; 1680 1681 pstats = &lgrp_stats[lgrpid]; 1682 atomic_add_64((uint64_t *)LGRP_STAT_WRITE_PTR(pstats, stat), val); 1683 } 1684 1685 int64_t 1686 lgrp_stat_read(lgrp_id_t lgrpid, lgrp_stat_t stat) 1687 { 1688 uint64_t val; 1689 struct lgrp_stats *pstats; 1690 1691 if (lgrpid < 0 || lgrpid > lgrp_alloc_max) 1692 return ((int64_t)0); 1693 1694 pstats = &lgrp_stats[lgrpid]; 1695 LGRP_STAT_READ(pstats, stat, val); 1696 return (val); 1697 } 1698 1699 /* 1700 * Reset all kstats for lgrp specified by its lgrpid. 1701 */ 1702 static void 1703 lgrp_kstat_reset(lgrp_id_t lgrpid) 1704 { 1705 lgrp_stat_t stat; 1706 1707 if (lgrpid < 0 || lgrpid > lgrp_alloc_max) 1708 return; 1709 1710 for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) { 1711 LGRP_STAT_RESET(&lgrp_stats[lgrpid], stat); 1712 } 1713 } 1714 1715 /* 1716 * Collect all per-lgrp statistics for the lgrp associated with this 1717 * kstat, and store them in the ks_data array. 1718 * 1719 * The superuser can reset all the running counter statistics for an 1720 * lgrp by writing to any of the lgrp's stats. 1721 */ 1722 static int 1723 lgrp_kstat_extract(kstat_t *ksp, int rw) 1724 { 1725 lgrp_stat_t stat; 1726 struct kstat_named *ksd; 1727 lgrp_t *lgrp; 1728 lgrp_id_t lgrpid; 1729 1730 lgrp = (lgrp_t *)ksp->ks_private; 1731 1732 ksd = (struct kstat_named *)ksp->ks_data; 1733 ASSERT(ksd == (struct kstat_named *)&lgrp_kstat_data); 1734 1735 lgrpid = lgrp->lgrp_id; 1736 1737 if (lgrpid == LGRP_NONE) { 1738 /* 1739 * Return all zeroes as stats for freed lgrp. 
1740 */ 1741 for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) { 1742 ksd[stat].value.i64 = 0; 1743 } 1744 ksd[stat + LGRP_NUM_CPUS].value.i64 = 0; 1745 ksd[stat + LGRP_NUM_PG_INSTALL].value.i64 = 0; 1746 ksd[stat + LGRP_NUM_PG_AVAIL].value.i64 = 0; 1747 ksd[stat + LGRP_NUM_PG_FREE].value.i64 = 0; 1748 ksd[stat + LGRP_LOADAVG].value.i64 = 0; 1749 } else if (rw != KSTAT_WRITE) { 1750 /* 1751 * Handle counter stats 1752 */ 1753 for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) { 1754 ksd[stat].value.i64 = lgrp_stat_read(lgrpid, stat); 1755 } 1756 1757 /* 1758 * Handle kernel data snapshot stats 1759 */ 1760 ksd[stat + LGRP_NUM_CPUS].value.i64 = lgrp->lgrp_cpucnt; 1761 ksd[stat + LGRP_NUM_PG_INSTALL].value.i64 = 1762 lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_INSTALL); 1763 ksd[stat + LGRP_NUM_PG_AVAIL].value.i64 = 1764 lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_AVAIL); 1765 ksd[stat + LGRP_NUM_PG_FREE].value.i64 = 1766 lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_FREE); 1767 ksd[stat + LGRP_LOADAVG].value.i64 = lgrp_sum_loadavgs(lgrp); 1768 } else { 1769 lgrp_kstat_reset(lgrpid); 1770 } 1771 1772 return (0); 1773 } 1774 1775 int 1776 lgrp_query_cpu(processorid_t id, lgrp_id_t *lp) 1777 { 1778 cpu_t *cp; 1779 1780 mutex_enter(&cpu_lock); 1781 1782 if ((cp = cpu_get(id)) == NULL) { 1783 mutex_exit(&cpu_lock); 1784 return (EINVAL); 1785 } 1786 1787 if (cpu_is_offline(cp) || cpu_is_poweredoff(cp)) { 1788 mutex_exit(&cpu_lock); 1789 return (EINVAL); 1790 } 1791 1792 ASSERT(cp->cpu_lpl != NULL); 1793 1794 *lp = cp->cpu_lpl->lpl_lgrpid; 1795 1796 mutex_exit(&cpu_lock); 1797 1798 return (0); 1799 } 1800 1801 int 1802 lgrp_query_load(processorid_t id, lgrp_load_t *lp) 1803 { 1804 cpu_t *cp; 1805 1806 mutex_enter(&cpu_lock); 1807 1808 if ((cp = cpu_get(id)) == NULL) { 1809 mutex_exit(&cpu_lock); 1810 return (EINVAL); 1811 } 1812 1813 ASSERT(cp->cpu_lpl != NULL); 1814 1815 *lp = cp->cpu_lpl->lpl_loadavg; 1816 1817 mutex_exit(&cpu_lock); 1818 1819 return (0); 1820 } 1821 1822 void 1823 lgrp_latency_change(u_longlong_t oldtime, u_longlong_t newtime) 1824 { 1825 lgrp_t *lgrp; 1826 int i; 1827 1828 for (i = 0; i <= lgrp_alloc_max; i++) { 1829 lgrp = lgrp_table[i]; 1830 1831 if (LGRP_EXISTS(lgrp) && (lgrp->lgrp_latency == oldtime)) 1832 lgrp->lgrp_latency = (int)newtime; 1833 } 1834 } 1835 1836 /* 1837 * Add a resource named by lpl_leaf to rset of lpl_target 1838 * 1839 * This routine also adjusts ncpu and nrset if the call succeeds in adding a 1840 * resource. It is adjusted here, as this is presently the only place that we 1841 * can be certain a resource addition has succeeded. 1842 * 1843 * We keep the list of rsets sorted so that the dispatcher can quickly walk the 1844 * list in order until it reaches a NULL. (This list is required to be NULL 1845 * terminated, too). This is done so that we can mark start pos + 1, so that 1846 * each lpl is traversed sequentially, but in a different order. We hope this 1847 * will improve performance a bit. (Hopefully, less read-to-own traffic...) 
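 *
 * For example (illustration only): if lpl_target's rset currently holds the
 * leaves for lgroups 1, 3 and 9 and the leaf for lgroup 4 is added, the
 * existing entries are scooted over so the rset reads 1, 3, 4, 9 (still
 * sorted by lpl_lgrpid and NULL terminated), and lpl_nrset and lpl_ncpu are
 * bumped to match.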
1848 */ 1849 1850 void 1851 lpl_rset_add(lpl_t *lpl_target, lpl_t *lpl_leaf) 1852 { 1853 int i; 1854 int entry_slot = 0; 1855 1856 /* return if leaf is already present */ 1857 for (i = 0; i < lpl_target->lpl_nrset; i++) { 1858 if (lpl_target->lpl_rset[i] == lpl_leaf) { 1859 return; 1860 } 1861 1862 if (lpl_target->lpl_rset[i]->lpl_lgrpid > 1863 lpl_leaf->lpl_lgrpid) { 1864 break; 1865 } 1866 } 1867 1868 /* insert leaf, update counts */ 1869 entry_slot = i; 1870 i = lpl_target->lpl_nrset++; 1871 if (lpl_target->lpl_nrset >= LPL_RSET_MAX) { 1872 panic("More leaf lgrps in system than are supported!\n"); 1873 } 1874 1875 /* 1876 * Start at the end of the rset array and work backwards towards the 1877 * slot into which the new lpl will be inserted. This effectively 1878 * preserves the current ordering by scooting everybody over one entry, 1879 * and placing the new entry into the space created. 1880 */ 1881 1882 while (i-- > entry_slot) { 1883 lpl_target->lpl_rset[i + 1] = lpl_target->lpl_rset[i]; 1884 } 1885 1886 lpl_target->lpl_rset[entry_slot] = lpl_leaf; 1887 lpl_target->lpl_ncpu += lpl_leaf->lpl_ncpu; 1888 } 1889 1890 /* 1891 * Update each of lpl_parent's children with a proper hint and 1892 * a reference to their parent. 1893 * The lgrp topology is used as the reference since it is fully 1894 * consistent and correct at this point. 1895 * 1896 * Each child's hint will reference an element in lpl_parent's 1897 * rset that designates where the child should start searching 1898 * for CPU resources. The hint selected is the highest order leaf present 1899 * in the child's lineage. 1900 * 1901 * This should be called after any potential change in lpl_parent's 1902 * rset. 1903 */ 1904 static void 1905 lpl_child_update(lpl_t *lpl_parent, struct cpupart *cp) 1906 { 1907 klgrpset_t children, leaves; 1908 lpl_t *lpl; 1909 int hint; 1910 int i, j; 1911 1912 children = lgrp_table[lpl_parent->lpl_lgrpid]->lgrp_children; 1913 if (klgrpset_isempty(children)) 1914 return; /* nothing to do */ 1915 1916 for (i = 0; i <= lgrp_alloc_max; i++) { 1917 if (klgrpset_ismember(children, i)) { 1918 1919 /* 1920 * Given the set of leaves in this child's lineage, 1921 * find the highest order leaf present in the parent's 1922 * rset. Select this as the hint for the child. 1923 */ 1924 leaves = lgrp_table[i]->lgrp_leaves; 1925 hint = 0; 1926 for (j = 0; j < lpl_parent->lpl_nrset; j++) { 1927 lpl = lpl_parent->lpl_rset[j]; 1928 if (klgrpset_ismember(leaves, lpl->lpl_lgrpid)) 1929 hint = j; 1930 } 1931 cp->cp_lgrploads[i].lpl_hint = hint; 1932 1933 /* 1934 * (Re)set the parent. It may be incorrect if 1935 * lpl_parent is new in the topology. 1936 */ 1937 cp->cp_lgrploads[i].lpl_parent = lpl_parent; 1938 } 1939 } 1940 } 1941 1942 /* 1943 * Delete resource lpl_leaf from rset of lpl_target, assuming it's there. 1944 * 1945 * This routine also adjusts ncpu and nrset if the call succeeds in deleting a 1946 * resource. The values are adjusted here, as this is the only place that we can 1947 * be certain a resource was successfully deleted. 
1948 */ 1949 void 1950 lpl_rset_del(lpl_t *lpl_target, lpl_t *lpl_leaf) 1951 { 1952 int i; 1953 1954 /* find leaf in intermediate node */ 1955 for (i = 0; i < lpl_target->lpl_nrset; i++) { 1956 if (lpl_target->lpl_rset[i] == lpl_leaf) 1957 break; 1958 } 1959 1960 /* return if leaf not found */ 1961 if (lpl_target->lpl_rset[i] != lpl_leaf) 1962 return; 1963 1964 /* prune leaf, compress array */ 1965 ASSERT(lpl_target->lpl_nrset < LPL_RSET_MAX); 1966 lpl_target->lpl_rset[lpl_target->lpl_nrset--] = NULL; 1967 lpl_target->lpl_ncpu--; 1968 do { 1969 lpl_target->lpl_rset[i] = lpl_target->lpl_rset[i + 1]; 1970 } while (i++ < lpl_target->lpl_nrset); 1971 } 1972 1973 /* 1974 * Check to see if the resource set of the target lpl contains the 1975 * supplied leaf lpl. This returns 1 if the lpl is found, 0 if it is not. 1976 */ 1977 1978 int 1979 lpl_rset_contains(lpl_t *lpl_target, lpl_t *lpl_leaf) 1980 { 1981 int i; 1982 1983 for (i = 0; i < lpl_target->lpl_nrset; i++) { 1984 if (lpl_target->lpl_rset[i] == lpl_leaf) 1985 return (1); 1986 } 1987 1988 return (0); 1989 } 1990 1991 /* 1992 * Called when we change cpu lpl membership. This increments or decrements the 1993 * per-cpu counter in every lpl in which our leaf appears. 1994 */ 1995 void 1996 lpl_cpu_adjcnt(lpl_act_t act, cpu_t *cp) 1997 { 1998 cpupart_t *cpupart; 1999 lgrp_t *lgrp_leaf; 2000 lgrp_t *lgrp_cur; 2001 lpl_t *lpl_leaf; 2002 lpl_t *lpl_cur; 2003 int i; 2004 2005 ASSERT(act == LPL_DECREMENT || act == LPL_INCREMENT); 2006 2007 cpupart = cp->cpu_part; 2008 lpl_leaf = cp->cpu_lpl; 2009 lgrp_leaf = lgrp_table[lpl_leaf->lpl_lgrpid]; 2010 2011 for (i = 0; i <= lgrp_alloc_max; i++) { 2012 lgrp_cur = lgrp_table[i]; 2013 2014 /* 2015 * Don't adjust if the lgrp isn't there, if we're the leaf lpl 2016 * for the cpu in question, or if the current lgrp and leaf 2017 * don't share the same resources. 2018 */ 2019 2020 if (!LGRP_EXISTS(lgrp_cur) || (lgrp_cur == lgrp_leaf) || 2021 !klgrpset_intersects(lgrp_leaf->lgrp_set[LGRP_RSRC_CPU], 2022 lgrp_cur->lgrp_set[LGRP_RSRC_CPU])) 2023 continue; 2024 2025 2026 lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id]; 2027 2028 if (lpl_cur->lpl_nrset > 0) { 2029 if (act == LPL_INCREMENT) { 2030 lpl_cur->lpl_ncpu++; 2031 } else if (act == LPL_DECREMENT) { 2032 lpl_cur->lpl_ncpu--; 2033 } 2034 } 2035 } 2036 } 2037 2038 /* 2039 * Initialize lpl with given resources and specified lgrp 2040 */ 2041 2042 void 2043 lpl_init(lpl_t *lpl, lpl_t *lpl_leaf, lgrp_t *lgrp) 2044 { 2045 lpl->lpl_lgrpid = lgrp->lgrp_id; 2046 lpl->lpl_loadavg = 0; 2047 if (lpl == lpl_leaf) 2048 lpl->lpl_ncpu = 1; 2049 else 2050 lpl->lpl_ncpu = lpl_leaf->lpl_ncpu; 2051 lpl->lpl_nrset = 1; 2052 lpl->lpl_rset[0] = lpl_leaf; 2053 lpl->lpl_lgrp = lgrp; 2054 lpl->lpl_parent = NULL; /* set by lpl_leaf_insert() */ 2055 lpl->lpl_cpus = NULL; /* set by lgrp_part_add_cpu() */ 2056 } 2057 2058 /* 2059 * Clear an unused lpl 2060 */ 2061 2062 void 2063 lpl_clear(lpl_t *lpl) 2064 { 2065 lgrp_id_t lid; 2066 2067 /* save lid for debugging purposes */ 2068 lid = lpl->lpl_lgrpid; 2069 bzero(lpl, sizeof (lpl_t)); 2070 lpl->lpl_lgrpid = lid; 2071 } 2072 2073 /* 2074 * Given a CPU-partition, verify that the lpl topology in the CPU-partition 2075 * is in sync with the lgroup toplogy in the system. The lpl topology may not 2076 * make full use of all of the lgroup topology, but this checks to make sure 2077 * that for the parts that it does use, it has correctly understood the 2078 * relationships that exist. 
This function returns 2079 * 0 if the topology is correct, and a non-zero error code, for non-debug 2080 * kernels if incorrect. Asserts are spread throughout the code to aid in 2081 * debugging on a DEBUG kernel. 2082 */ 2083 int 2084 lpl_topo_verify(cpupart_t *cpupart) 2085 { 2086 lgrp_t *lgrp; 2087 lpl_t *lpl; 2088 klgrpset_t rset; 2089 klgrpset_t cset; 2090 cpu_t *cpu; 2091 cpu_t *cp_start; 2092 int i; 2093 int j; 2094 int sum; 2095 2096 /* topology can't be incorrect if it doesn't exist */ 2097 if (!lgrp_topo_initialized || !lgrp_initialized) 2098 return (LPL_TOPO_CORRECT); 2099 2100 ASSERT(cpupart != NULL); 2101 2102 for (i = 0; i <= lgrp_alloc_max; i++) { 2103 lgrp = lgrp_table[i]; 2104 lpl = NULL; 2105 /* make sure lpls are allocated */ 2106 ASSERT(cpupart->cp_lgrploads); 2107 if (!cpupart->cp_lgrploads) 2108 return (LPL_TOPO_PART_HAS_NO_LPL); 2109 2110 lpl = &cpupart->cp_lgrploads[i]; 2111 /* make sure our index is good */ 2112 ASSERT(i < cpupart->cp_nlgrploads); 2113 2114 /* if lgroup doesn't exist, make sure lpl is empty */ 2115 if (!LGRP_EXISTS(lgrp)) { 2116 ASSERT(lpl->lpl_ncpu == 0); 2117 if (lpl->lpl_ncpu > 0) { 2118 return (LPL_TOPO_CPUS_NOT_EMPTY); 2119 } else { 2120 continue; 2121 } 2122 } 2123 2124 /* verify that lgroup and lpl are identically numbered */ 2125 ASSERT(lgrp->lgrp_id == lpl->lpl_lgrpid); 2126 2127 /* if lgroup isn't in our partition, make sure lpl is empty */ 2128 if (!klgrpset_intersects(lgrp->lgrp_leaves, 2129 cpupart->cp_lgrpset)) { 2130 ASSERT(lpl->lpl_ncpu == 0); 2131 if (lpl->lpl_ncpu > 0) { 2132 return (LPL_TOPO_CPUS_NOT_EMPTY); 2133 } 2134 /* 2135 * lpl is empty, and lgroup isn't in partition. verify 2136 * that lpl doesn't show up in anyone else's rsets (in 2137 * this partition, anyway) 2138 */ 2139 2140 for (j = 0; j < cpupart->cp_nlgrploads; j++) { 2141 lpl_t *i_lpl; /* lpl we're iterating over */ 2142 2143 i_lpl = &cpupart->cp_lgrploads[j]; 2144 2145 ASSERT(!lpl_rset_contains(i_lpl, lpl)); 2146 if (lpl_rset_contains(i_lpl, lpl)) { 2147 return (LPL_TOPO_LPL_ORPHANED); 2148 } 2149 } 2150 /* lgroup is empty, and everything is ok. continue */ 2151 continue; 2152 } 2153 2154 2155 /* lgroup is in this partition, now check it against lpl */ 2156 2157 /* do both have matching lgrps? */ 2158 ASSERT(lgrp == lpl->lpl_lgrp); 2159 if (lgrp != lpl->lpl_lgrp) { 2160 return (LPL_TOPO_LGRP_MISMATCH); 2161 } 2162 2163 /* do the parent lgroups exist and do they match? */ 2164 if (lgrp->lgrp_parent) { 2165 ASSERT(lpl->lpl_parent); 2166 ASSERT(lgrp->lgrp_parent->lgrp_id == 2167 lpl->lpl_parent->lpl_lgrpid); 2168 2169 if (!lpl->lpl_parent) { 2170 return (LPL_TOPO_MISSING_PARENT); 2171 } else if (lgrp->lgrp_parent->lgrp_id != 2172 lpl->lpl_parent->lpl_lgrpid) { 2173 return (LPL_TOPO_PARENT_MISMATCH); 2174 } 2175 } 2176 2177 /* only leaf lgroups keep a cpucnt, only check leaves */ 2178 if ((lpl->lpl_nrset == 1) && (lpl == lpl->lpl_rset[0])) { 2179 2180 /* verify that lgrp is also a leaf */ 2181 ASSERT((lgrp->lgrp_childcnt == 0) && 2182 (klgrpset_ismember(lgrp->lgrp_leaves, 2183 lpl->lpl_lgrpid))); 2184 2185 if ((lgrp->lgrp_childcnt > 0) || 2186 (!klgrpset_ismember(lgrp->lgrp_leaves, 2187 lpl->lpl_lgrpid))) { 2188 return (LPL_TOPO_LGRP_NOT_LEAF); 2189 } 2190 2191 ASSERT((lgrp->lgrp_cpucnt >= lpl->lpl_ncpu) && 2192 (lpl->lpl_ncpu > 0)); 2193 if ((lgrp->lgrp_cpucnt < lpl->lpl_ncpu) || 2194 (lpl->lpl_ncpu <= 0)) { 2195 return (LPL_TOPO_BAD_CPUCNT); 2196 } 2197 2198 /* 2199 * Check that lpl_ncpu also matches the number of 2200 * cpus in the lpl's linked list. 
This only exists in 2201 * leaves, but they should always match. 2202 */ 2203 j = 0; 2204 cpu = cp_start = lpl->lpl_cpus; 2205 while (cpu != NULL) { 2206 j++; 2207 2208 /* check to make sure cpu's lpl is leaf lpl */ 2209 ASSERT(cpu->cpu_lpl == lpl); 2210 if (cpu->cpu_lpl != lpl) { 2211 return (LPL_TOPO_CPU_HAS_BAD_LPL); 2212 } 2213 2214 /* check next cpu */ 2215 if ((cpu = cpu->cpu_next_lpl) != cp_start) { 2216 continue; 2217 } else { 2218 cpu = NULL; 2219 } 2220 } 2221 2222 ASSERT(j == lpl->lpl_ncpu); 2223 if (j != lpl->lpl_ncpu) { 2224 return (LPL_TOPO_LPL_BAD_NCPU); 2225 } 2226 2227 /* 2228 * Also, check that leaf lpl is contained in all 2229 * intermediate lpls that name the leaf as a descendant 2230 */ 2231 2232 for (j = 0; j <= lgrp_alloc_max; j++) { 2233 klgrpset_t intersect; 2234 lgrp_t *lgrp_cand; 2235 lpl_t *lpl_cand; 2236 2237 lgrp_cand = lgrp_table[j]; 2238 intersect = klgrpset_intersects( 2239 lgrp_cand->lgrp_set[LGRP_RSRC_CPU], 2240 cpupart->cp_lgrpset); 2241 2242 if (!LGRP_EXISTS(lgrp_cand) || 2243 !klgrpset_intersects(lgrp_cand->lgrp_leaves, 2244 cpupart->cp_lgrpset) || 2245 (intersect == 0)) 2246 continue; 2247 2248 lpl_cand = 2249 &cpupart->cp_lgrploads[lgrp_cand->lgrp_id]; 2250 2251 if (klgrpset_ismember(intersect, 2252 lgrp->lgrp_id)) { 2253 ASSERT(lpl_rset_contains(lpl_cand, 2254 lpl)); 2255 2256 if (!lpl_rset_contains(lpl_cand, lpl)) { 2257 return (LPL_TOPO_RSET_MSSNG_LF); 2258 } 2259 } 2260 } 2261 2262 } else { /* non-leaf specific checks */ 2263 2264 /* 2265 * Non-leaf lpls should have lpl_cpus == NULL 2266 * verify that this is so 2267 */ 2268 ASSERT(lpl->lpl_cpus == NULL); 2269 if (lpl->lpl_cpus != NULL) { 2270 return (LPL_TOPO_NONLEAF_HAS_CPUS); 2271 } 2272 2273 /* 2274 * verify that the sum of the cpus in the leaf resources 2275 * is equal to the total ncpu in the intermediate 2276 */ 2277 for (j = sum = 0; j < lpl->lpl_nrset; j++) { 2278 sum += lpl->lpl_rset[j]->lpl_ncpu; 2279 } 2280 2281 ASSERT(sum == lpl->lpl_ncpu); 2282 if (sum != lpl->lpl_ncpu) { 2283 return (LPL_TOPO_LPL_BAD_NCPU); 2284 } 2285 } 2286 2287 /* 2288 * check on lpl_hint. Don't check root, since it has no parent. 2289 */ 2290 if (lpl->lpl_parent != NULL) { 2291 int hint; 2292 lpl_t *hint_lpl; 2293 2294 /* make sure hint is within limits of nrset */ 2295 hint = lpl->lpl_hint; 2296 ASSERT(lpl->lpl_parent->lpl_nrset >= hint); 2297 if (lpl->lpl_parent->lpl_nrset < hint) { 2298 return (LPL_TOPO_BOGUS_HINT); 2299 } 2300 2301 /* make sure hint points to valid lpl */ 2302 hint_lpl = lpl->lpl_parent->lpl_rset[hint]; 2303 ASSERT(hint_lpl->lpl_ncpu > 0); 2304 if (hint_lpl->lpl_ncpu <= 0) { 2305 return (LPL_TOPO_BOGUS_HINT); 2306 } 2307 } 2308 2309 /* 2310 * Check the rset of the lpl in question. Make sure that each 2311 * rset contains a subset of the resources in 2312 * lgrp_set[LGRP_RSRC_CPU] and in cp_lgrpset. This also makes 2313 * sure that each rset doesn't include resources that are 2314 * outside of that set. (Which would be resources somehow not 2315 * accounted for). 
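 * Concretely, the code below gathers the lgrp IDs named by lpl_rset[] into
 * rset and then takes two set differences: rset minus
 * lgrp_set[LGRP_RSRC_CPU] and rset minus cp_lgrpset. Both must come up
 * empty for the topology to be considered correct.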
2316 */ 2317 2318 klgrpset_clear(rset); 2319 for (j = 0; j < lpl->lpl_nrset; j++) { 2320 klgrpset_add(rset, lpl->lpl_rset[j]->lpl_lgrpid); 2321 } 2322 klgrpset_copy(cset, rset); 2323 /* make sure lpl rset matches lgrp rset */ 2324 klgrpset_diff(rset, lgrp->lgrp_set[LGRP_RSRC_CPU]); 2325 /* make sure rset is contained with in partition, too */ 2326 klgrpset_diff(cset, cpupart->cp_lgrpset); 2327 2328 ASSERT(klgrpset_isempty(rset) && 2329 klgrpset_isempty(cset)); 2330 if (!klgrpset_isempty(rset) || 2331 !klgrpset_isempty(cset)) { 2332 return (LPL_TOPO_RSET_MISMATCH); 2333 } 2334 2335 /* 2336 * check to make sure lpl_nrset matches the number of rsets 2337 * contained in the lpl 2338 */ 2339 2340 for (j = 0; (lpl->lpl_rset[j] != NULL) && (j < LPL_RSET_MAX); 2341 j++); 2342 2343 ASSERT(j == lpl->lpl_nrset); 2344 if (j != lpl->lpl_nrset) { 2345 return (LPL_TOPO_BAD_RSETCNT); 2346 } 2347 2348 } 2349 return (LPL_TOPO_CORRECT); 2350 } 2351 2352 /* 2353 * Flatten lpl topology to given number of levels. This is presently only 2354 * implemented for a flatten to 2 levels, which will prune out the intermediates 2355 * and home the leaf lpls to the root lpl. 2356 */ 2357 int 2358 lpl_topo_flatten(int levels) 2359 { 2360 int i; 2361 uint_t sum; 2362 lgrp_t *lgrp_cur; 2363 lpl_t *lpl_cur; 2364 lpl_t *lpl_root; 2365 cpupart_t *cp; 2366 2367 if (levels != 2) 2368 return (0); 2369 2370 /* called w/ cpus paused - grab no locks! */ 2371 ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 || 2372 !lgrp_initialized); 2373 2374 cp = cp_list_head; 2375 do { 2376 lpl_root = &cp->cp_lgrploads[lgrp_root->lgrp_id]; 2377 ASSERT(LGRP_EXISTS(lgrp_root) && (lpl_root->lpl_ncpu > 0)); 2378 2379 for (i = 0; i <= lgrp_alloc_max; i++) { 2380 lgrp_cur = lgrp_table[i]; 2381 lpl_cur = &cp->cp_lgrploads[i]; 2382 2383 if ((lgrp_cur == lgrp_root) || 2384 (!LGRP_EXISTS(lgrp_cur) && 2385 (lpl_cur->lpl_ncpu == 0))) 2386 continue; 2387 2388 if (!LGRP_EXISTS(lgrp_cur) && (lpl_cur->lpl_ncpu > 0)) { 2389 /* 2390 * this should be a deleted intermediate, so 2391 * clear it 2392 */ 2393 lpl_clear(lpl_cur); 2394 } else if ((lpl_cur->lpl_nrset == 1) && 2395 (lpl_cur->lpl_rset[0] == lpl_cur) && 2396 ((lpl_cur->lpl_parent->lpl_ncpu == 0) || 2397 (!LGRP_EXISTS(lpl_cur->lpl_parent->lpl_lgrp)))) { 2398 /* 2399 * this is a leaf whose parent was deleted, or 2400 * whose parent had their lgrp deleted. (And 2401 * whose parent will soon be deleted). Point 2402 * this guy back to the root lpl. 2403 */ 2404 lpl_cur->lpl_parent = lpl_root; 2405 lpl_rset_add(lpl_root, lpl_cur); 2406 } 2407 2408 } 2409 2410 /* 2411 * Now that we're done, make sure the count on the root lpl is 2412 * correct, and update the hints of the children for the sake of 2413 * thoroughness 2414 */ 2415 for (i = sum = 0; i < lpl_root->lpl_nrset; i++) { 2416 sum += lpl_root->lpl_rset[i]->lpl_ncpu; 2417 } 2418 lpl_root->lpl_ncpu = sum; 2419 lpl_child_update(lpl_root, cp); 2420 2421 cp = cp->cp_next; 2422 } while (cp != cp_list_head); 2423 2424 return (levels); 2425 } 2426 2427 /* 2428 * Insert a lpl into the resource hierarchy and create any additional lpls that 2429 * are necessary to represent the varying states of locality for the cpu 2430 * resoruces newly added to the partition. 2431 * 2432 * This routine is clever enough that it can correctly add resources from the 2433 * new leaf into both direct and indirect resource sets in the hierarchy. (Ie, 2434 * those for which the lpl is a leaf as opposed to simply a named equally local 2435 * resource). 
The one special case that needs additional processing is when a 2436 * new intermediate lpl is introduced. Since the main loop only traverses 2437 * looking to add the leaf resource where it does not yet exist, additional work 2438 * is necessary to add other leaf resources that may need to exist in the newly 2439 * created intermediate. This is performed by the second inner loop, and is 2440 * only done when the check for more than one overlapping resource succeeds. 2441 */ 2442 2443 void 2444 lpl_leaf_insert(lpl_t *lpl_leaf, cpupart_t *cpupart) 2445 { 2446 int i; 2447 int j; 2448 int hint; 2449 int rset_num_intersect; 2450 lgrp_t *lgrp_cur; 2451 lpl_t *lpl_cur; 2452 lpl_t *lpl_parent; 2453 lgrp_id_t parent_id; 2454 klgrpset_t rset_intersect; /* resources in cpupart and lgrp */ 2455 2456 for (i = 0; i <= lgrp_alloc_max; i++) { 2457 lgrp_cur = lgrp_table[i]; 2458 2459 /* 2460 * Don't insert if the lgrp isn't there, if the leaf isn't 2461 * contained within the current lgrp, or if the current lgrp has 2462 * no leaves in this partition 2463 */ 2464 2465 if (!LGRP_EXISTS(lgrp_cur) || 2466 !klgrpset_ismember(lgrp_cur->lgrp_set[LGRP_RSRC_CPU], 2467 lpl_leaf->lpl_lgrpid) || 2468 !klgrpset_intersects(lgrp_cur->lgrp_leaves, 2469 cpupart->cp_lgrpset)) 2470 continue; 2471 2472 lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id]; 2473 if (lgrp_cur->lgrp_parent != NULL) { 2474 /* if lgrp has a parent, assign it properly */ 2475 parent_id = lgrp_cur->lgrp_parent->lgrp_id; 2476 lpl_parent = &cpupart->cp_lgrploads[parent_id]; 2477 } else { 2478 /* if not, make sure parent ptr gets set to null */ 2479 lpl_parent = NULL; 2480 } 2481 2482 if (lpl_cur == lpl_leaf) { 2483 /* 2484 * Almost all leaf state was initialized elsewhere. The 2485 * only thing left to do is to set the parent. 2486 */ 2487 lpl_cur->lpl_parent = lpl_parent; 2488 continue; 2489 } 2490 2491 /* 2492 * Initialize intermediate lpl 2493 * Save this lpl's hint though. Since we're changing this 2494 * lpl's resources, we need to update the hint in this lpl's 2495 * children, but the hint in this lpl is unaffected and 2496 * should be preserved. 2497 */ 2498 hint = lpl_cur->lpl_hint; 2499 2500 lpl_clear(lpl_cur); 2501 lpl_init(lpl_cur, lpl_leaf, lgrp_cur); 2502 2503 lpl_cur->lpl_hint = hint; 2504 lpl_cur->lpl_parent = lpl_parent; 2505 2506 /* does new lpl need to be populated with other resources? */ 2507 rset_intersect = 2508 klgrpset_intersects(lgrp_cur->lgrp_set[LGRP_RSRC_CPU], 2509 cpupart->cp_lgrpset); 2510 klgrpset_nlgrps(rset_intersect, rset_num_intersect); 2511 2512 if (rset_num_intersect > 1) { 2513 /* 2514 * If so, figure out what lpls have resources that 2515 * intersect this one, and add them. 2516 */ 2517 for (j = 0; j <= lgrp_alloc_max; j++) { 2518 lgrp_t *lgrp_cand; /* candidate lgrp */ 2519 lpl_t *lpl_cand; /* candidate lpl */ 2520 2521 lgrp_cand = lgrp_table[j]; 2522 if (!LGRP_EXISTS(lgrp_cand) || 2523 !klgrpset_ismember(rset_intersect, 2524 lgrp_cand->lgrp_id)) 2525 continue; 2526 lpl_cand = 2527 &cpupart->cp_lgrploads[lgrp_cand->lgrp_id]; 2528 lpl_rset_add(lpl_cur, lpl_cand); 2529 } 2530 } 2531 /* 2532 * This lpl's rset has changed. Update the hint in it's 2533 * children. 2534 */ 2535 lpl_child_update(lpl_cur, cpupart); 2536 } 2537 } 2538 2539 /* 2540 * remove a lpl from the hierarchy of resources, clearing its state when 2541 * finished. If the lpls at the intermediate levels of the hierarchy have no 2542 * remaining resources, or no longer name a leaf resource in the cpu-partition, 2543 * delete them as well. 
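 *
 * For example, if the leaf being removed was the only remaining entry in an
 * intermediate lpl's rset, that intermediate's rset becomes empty and the
 * intermediate itself is cleared via lpl_clear().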
2544 */
2545
2546 void
2547 lpl_leaf_remove(lpl_t *lpl_leaf, cpupart_t *cpupart)
2548 {
2549 int i;
2550 lgrp_t *lgrp_cur;
2551 lpl_t *lpl_cur;
2552 klgrpset_t leaf_intersect; /* intersection of leaves */
2553
2554 for (i = 0; i <= lgrp_alloc_max; i++) {
2555 lgrp_cur = lgrp_table[i];
2556
2557 /*
2558 * Don't attempt to remove from lgrps that aren't there, that
2559 * don't contain our leaf, or from the leaf itself. (We do that
2560 * later)
2561 */
2562
2563 if (!LGRP_EXISTS(lgrp_cur))
2564 continue;
2565
2566 lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id];
2567
2568 if (!klgrpset_ismember(lgrp_cur->lgrp_set[LGRP_RSRC_CPU],
2569 lpl_leaf->lpl_lgrpid) ||
2570 (lpl_cur == lpl_leaf)) {
2571 continue;
2572 }
2573
2574 /*
2575 * This is a slightly sleazy simplification in that we have
2576 * already marked the cp_lgrpset as no longer containing the
2577 * leaf we've deleted. Any lpls that pass the above checks
2578 * based upon lgrp membership but not necessarily cpu-part
2579 * membership also get cleared by the checks below. Currently
2580 * this is harmless, as the lpls should be empty anyway.
2581 *
2582 * In particular, we want to preserve lpls that have additional
2583 * leaf resources, even though we don't yet have a processor
2584 * architecture that represents resources this way.
2585 */
2586
2587 leaf_intersect = klgrpset_intersects(lgrp_cur->lgrp_leaves,
2588 cpupart->cp_lgrpset);
2589
2590 lpl_rset_del(lpl_cur, lpl_leaf);
2591 if ((lpl_cur->lpl_nrset == 0) || (!leaf_intersect)) {
2592 lpl_clear(lpl_cur);
2593 } else {
2594 /*
2595 * Update this lpl's children
2596 */
2597 lpl_child_update(lpl_cur, cpupart);
2598 }
2599 }
2600 lpl_clear(lpl_leaf);
2601 }
2602
2603 /*
2604 * add a cpu to a partition in terms of lgrp load avg bookkeeping
2605 *
2606 * The lpl (cpu partition load average information) is now arranged in a
2607 * hierarchical fashion whereby resources that are closest, i.e. most local, to
2608 * the cpu in question are considered to be leaves in a tree of resources.
2609 * There are two general cases for cpu addition:
2610 *
2611 * 1. A lpl structure that contains resources already in the hierarchy tree.
2612 * In this case, all of the associated lpl relationships have been defined, and
2613 * all that is necessary is that we link the new cpu into the per-lpl list of
2614 * cpus, and increment the ncpu count of all places where this cpu resource will
2615 * be accounted for. lpl_cpu_adjcnt updates the cpu count, and the cpu pointer
2616 * pushing is accomplished by this routine.
2617 *
2618 * 2. The lpl to contain the resources in this cpu-partition for this lgrp does
2619 * not exist yet. In this case, it is necessary to build the leaf lpl, and
2620 * construct the hierarchy of state necessary to name its more distant
2621 * resources, if they should exist. The leaf structure is initialized by this
2622 * routine, as is the cpu-partition state for the lgrp membership. This routine
2623 * also calls lpl_leaf_insert() which inserts the named lpl into the hierarchy
2624 * and builds all of the "ancestral" state necessary to identify resources at
2625 * differing levels of locality.
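 *
 * In outline, this routine does (a condensed sketch of the code below):
 *
 *	if (lpl_leaf->lpl_ncpu++ == 0)		case 2: build the new leaf
 *		lpl_init(); klgrpset_add(); lpl_leaf_insert();
 *	else					case 1: leaf already present
 *		lpl_cpu_adjcnt(LPL_INCREMENT, cp);
 *
 * followed in both cases by linking cp into the leaf's circular cpu list.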
2626 */
2627 void
2628 lgrp_part_add_cpu(cpu_t *cp, lgrp_id_t lgrpid)
2629 {
2630 cpupart_t *cpupart;
2631 lgrp_t *lgrp_leaf;
2632 lpl_t *lpl_leaf;
2633
2634 /* called sometimes w/ cpus paused - grab no locks */
2635 ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);
2636
2637 cpupart = cp->cpu_part;
2638 lgrp_leaf = lgrp_table[lgrpid];
2639
2640 /* don't add non-existent lgrp */
2641 ASSERT(LGRP_EXISTS(lgrp_leaf));
2642 lpl_leaf = &cpupart->cp_lgrploads[lgrpid];
2643 cp->cpu_lpl = lpl_leaf;
2644
2645 /* only leaf lpls contain cpus */
2646
2647 if (lpl_leaf->lpl_ncpu++ == 0) {
2648 lpl_init(lpl_leaf, lpl_leaf, lgrp_leaf);
2649 klgrpset_add(cpupart->cp_lgrpset, lgrpid);
2650 lpl_leaf_insert(lpl_leaf, cpupart);
2651 } else {
2652 /*
2653 * the lpl should already exist in the parent, so just update
2654 * the count of available CPUs
2655 */
2656 lpl_cpu_adjcnt(LPL_INCREMENT, cp);
2657 }
2658
2659 /* link cpu into list of cpus in lpl */
2660
2661 if (lpl_leaf->lpl_cpus) {
2662 cp->cpu_next_lpl = lpl_leaf->lpl_cpus;
2663 cp->cpu_prev_lpl = lpl_leaf->lpl_cpus->cpu_prev_lpl;
2664 lpl_leaf->lpl_cpus->cpu_prev_lpl->cpu_next_lpl = cp;
2665 lpl_leaf->lpl_cpus->cpu_prev_lpl = cp;
2666 } else {
2667 /*
2668 * We increment ncpu immediately after we create a new leaf
2669 * lpl, so assert that ncpu == 1 for the case where we don't
2670 * have any cpu pointers yet.
2671 */
2672 ASSERT(lpl_leaf->lpl_ncpu == 1);
2673 lpl_leaf->lpl_cpus = cp->cpu_next_lpl = cp->cpu_prev_lpl = cp;
2674 }
2675
2676 }
2677
2678
2679 /*
2680 * remove a cpu from a partition in terms of lgrp load avg bookkeeping
2681 *
2682 * The lpl (cpu partition load average information) is now arranged in a
2683 * hierarchical fashion whereby resources that are closest, i.e. most local, to
2684 * the cpu in question are considered to be leaves in a tree of resources.
2685 * There are two removal cases in question:
2686 *
2687 * 1. Removal of the resource in the leaf leaves other resources remaining in
2688 * that leaf. (Another cpu still exists at this level of locality). In this
2689 * case, the count of available cpus is decremented in all associated lpls by
2690 * calling lpl_cpu_adjcnt(), and the pointer to the removed cpu is pruned
2691 * from the per-cpu lpl list.
2692 *
2693 * 2. Removal of the resource results in the lpl containing no resources. (It's
2694 * empty) In this case, all of what has occurred for the first step must take
2695 * place; however, additionally we must remove the lpl structure itself, prune
2696 * out any stranded lpls that do not directly name a leaf resource, and mark the
2697 * cpu partition in question as no longer containing resources from the lgrp of
2698 * the lpl that has been deleted. Cpu-partition changes are handled by this
2699 * method, but the lpl_leaf_remove function deals with the details of pruning
2700 * out the empty lpl and any of its orphaned direct ancestors.
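 *
 * In outline (a condensed sketch of the code below):
 *
 *	if (--lpl->lpl_ncpu == 0)		case 2: leaf is now empty
 *		klgrpset_del(); lpl_leaf_remove();
 *	else					case 1: other cpus remain
 *		unlink cp; lpl_cpu_adjcnt(LPL_DECREMENT, cp);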
2701 */ 2702 void 2703 lgrp_part_del_cpu(cpu_t *cp) 2704 { 2705 lpl_t *lpl; 2706 lpl_t *leaf_lpl; 2707 lgrp_t *lgrp_leaf; 2708 2709 /* called sometimes w/ cpus paused - grab no locks */ 2710 2711 ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized); 2712 2713 lpl = leaf_lpl = cp->cpu_lpl; 2714 lgrp_leaf = leaf_lpl->lpl_lgrp; 2715 2716 /* don't delete a leaf that isn't there */ 2717 ASSERT(LGRP_EXISTS(lgrp_leaf)); 2718 2719 /* no double-deletes */ 2720 ASSERT(lpl->lpl_ncpu); 2721 if (--lpl->lpl_ncpu == 0) { 2722 /* 2723 * This was the last cpu in this lgroup for this partition, 2724 * clear its bit in the partition's lgroup bitmask 2725 */ 2726 klgrpset_del(cp->cpu_part->cp_lgrpset, lpl->lpl_lgrpid); 2727 2728 /* eliminate remaning lpl link pointers in cpu, lpl */ 2729 lpl->lpl_cpus = cp->cpu_next_lpl = cp->cpu_prev_lpl = NULL; 2730 2731 lpl_leaf_remove(leaf_lpl, cp->cpu_part); 2732 } else { 2733 2734 /* unlink cpu from lists of cpus in lpl */ 2735 cp->cpu_prev_lpl->cpu_next_lpl = cp->cpu_next_lpl; 2736 cp->cpu_next_lpl->cpu_prev_lpl = cp->cpu_prev_lpl; 2737 if (lpl->lpl_cpus == cp) { 2738 lpl->lpl_cpus = cp->cpu_next_lpl; 2739 } 2740 2741 /* 2742 * Update the cpu count in the lpls associated with parent 2743 * lgroups. 2744 */ 2745 lpl_cpu_adjcnt(LPL_DECREMENT, cp); 2746 2747 } 2748 /* clear cpu's lpl ptr when we're all done */ 2749 cp->cpu_lpl = NULL; 2750 } 2751 2752 /* 2753 * Recompute load average for the specified partition/lgrp fragment. 2754 * 2755 * We rely on the fact that this routine is called from the clock thread 2756 * at a point before the clock thread can block (i.e. before its first 2757 * lock request). Since the clock thread can not be preempted (since it 2758 * runs at highest priority), we know that cpu partitions can not change 2759 * (since doing so would require either the repartition requester or the 2760 * cpu_pause thread to run on this cpu), so we can update the cpu's load 2761 * without grabbing cpu_lock. 2762 */ 2763 void 2764 lgrp_loadavg(lpl_t *lpl, uint_t nrcpus, int ageflag) 2765 { 2766 uint_t ncpu; 2767 int64_t old, new, f; 2768 2769 /* 2770 * 1 - exp(-1/(20 * ncpu)) << 13 = 400 for 1 cpu... 2771 */ 2772 static short expval[] = { 2773 0, 3196, 1618, 1083, 2774 814, 652, 543, 466, 2775 408, 363, 326, 297, 2776 272, 251, 233, 218, 2777 204, 192, 181, 172, 2778 163, 155, 148, 142, 2779 136, 130, 125, 121, 2780 116, 112, 109, 105 2781 }; 2782 2783 /* ASSERT (called from clock level) */ 2784 2785 if ((lpl == NULL) || /* we're booting - this is easiest for now */ 2786 ((ncpu = lpl->lpl_ncpu) == 0)) { 2787 return; 2788 } 2789 2790 for (;;) { 2791 2792 if (ncpu >= sizeof (expval) / sizeof (expval[0])) 2793 f = expval[1]/ncpu; /* good approx. for large ncpu */ 2794 else 2795 f = expval[ncpu]; 2796 2797 /* 2798 * Modify the load average atomically to avoid losing 2799 * anticipatory load updates (see lgrp_move_thread()). 2800 */ 2801 if (ageflag) { 2802 /* 2803 * We're supposed to both update and age the load. 2804 * This happens 10 times/sec. per cpu. We do a 2805 * little hoop-jumping to avoid integer overflow. 
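 * Roughly, the fixed-point arithmetic below computes
 *
 *	new ~= old - ((f * old) >> 16) + ((nrcpus * f) >> 7)
 *
 * i.e. the previous load decays geometrically while the caller-supplied
 * demand (nrcpus) is blended in.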
2806 */
2807 int64_t q, r;
2808
2809 do {
2810 old = new = lpl->lpl_loadavg;
2811 q = (old >> 16) << 7;
2812 r = (old & 0xffff) << 7;
2813 new += ((long long)(nrcpus - q) * f -
2814 ((r * f) >> 16)) >> 7;
2815
2816 /*
2817 * Check for overflow
2818 */
2819 if (new > LGRP_LOADAVG_MAX)
2820 new = LGRP_LOADAVG_MAX;
2821 else if (new < 0)
2822 new = 0;
2823 } while (cas32((lgrp_load_t *)&lpl->lpl_loadavg, old,
2824 new) != old);
2825 } else {
2826 /*
2827 * We're supposed to update the load, but not age it.
2828 * This option is used to update the load (which either
2829 * has already been aged in this 1/10 sec. interval or
2830 * soon will be) to account for a remotely executing
2831 * thread.
2832 */
2833 do {
2834 old = new = lpl->lpl_loadavg;
2835 new += f;
2836 /*
2837 * Check for overflow
2838 * Underflow not possible here
2839 */
2840 if (new < old)
2841 new = LGRP_LOADAVG_MAX;
2842 } while (cas32((lgrp_load_t *)&lpl->lpl_loadavg, old,
2843 new) != old);
2844 }
2845
2846 /*
2847 * Do the same for this lpl's parent
2848 */
2849 if ((lpl = lpl->lpl_parent) == NULL)
2850 break;
2851 ncpu = lpl->lpl_ncpu;
2852 }
2853 }
2854
2855 /*
2856 * Initialize lpl topology in the target based on topology currently present in
2857 * lpl_bootstrap.
2858 *
2859 * lpl_topo_bootstrap is only called once from cpupart_initialize_default() to
2860 * initialize the cp_default list of lpls. Up to this point all topology
2861 * operations were performed using lpl_bootstrap. Now cp_default has its own
2862 * list of lpls and all subsequent lpl operations should use it instead of
2863 * lpl_bootstrap. The `target' points to the list of lpls in cp_default and
2864 * `size' is the size of this list.
2865 *
2866 * This function walks the lpl topology in lpl_bootstrap and does four things:
2867 *
2868 * 1) Copies all fields from lpl_bootstrap to the target.
2869 *
2870 * 2) Sets CPU0 lpl pointer to the correct element of the target list.
2871 *
2872 * 3) Updates lpl_parent pointers to point to the lpls in the target list
2873 * instead of lpl_bootstrap.
2874 *
2875 * 4) Updates pointers in the resource list of the target to point to the lpls
2876 * in the target list instead of lpl_bootstrap.
2877 *
2878 * After lpl_topo_bootstrap() completes, target contains the same information
2879 * that would be present there if it were used during boot instead of
2880 * lpl_bootstrap. The information in lpl_bootstrap is not needed after this
2881 * point, so it is bzeroed.
2882 */
2883 void
2884 lpl_topo_bootstrap(lpl_t *target, int size)
2885 {
2886 lpl_t *lpl = lpl_bootstrap;
2887 lpl_t *target_lpl = target;
2888 int howmany;
2889 int id;
2890 int i;
2891
2892 /*
2893 * The only target that should be passed here is cp_default lpl list.
2894 */
2895 ASSERT(target == cp_default.cp_lgrploads);
2896 ASSERT(size == cp_default.cp_nlgrploads);
2897 ASSERT(!lgrp_topo_initialized);
2898 ASSERT(ncpus == 1);
2899
2900 howmany = MIN(LPL_BOOTSTRAP_SIZE, size);
2901 for (i = 0; i < howmany; i++, lpl++, target_lpl++) {
2902 /*
2903 * Copy all fields from lpl.
2904 */
2905
2906 *target_lpl = *lpl;
2907
2908 /*
2909 * Substitute CPU0 lpl pointer with one relative to target.
2910 */
2911 if (lpl->lpl_cpus == CPU) {
2912 ASSERT(CPU->cpu_lpl == lpl);
2913 CPU->cpu_lpl = target_lpl;
2914 }
2915
2916 /*
2917 * Substitute parent information with parent relative to target.
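 * The substitution just preserves each pointer's offset within the list:
 * new = target + (old - lpl_bootstrap), expressed with uintptr_t
 * arithmetic below.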
2918 */
2919 if (lpl->lpl_parent != NULL)
2920 target_lpl->lpl_parent = (lpl_t *)
2921 (((uintptr_t)lpl->lpl_parent -
2922 (uintptr_t)lpl_bootstrap) +
2923 (uintptr_t)target);
2924
2925 /*
2926 * Walk over the resource set, substituting pointers relative to
2927 * lpl_bootstrap with pointers relative to target.
2928 */
2929 ASSERT(lpl->lpl_nrset <= 1);
2930
2931 for (id = 0; id < lpl->lpl_nrset; id++) {
2932 if (lpl->lpl_rset[id] != NULL) {
2933 target_lpl->lpl_rset[id] =
2934 (lpl_t *)
2935 (((uintptr_t)lpl->lpl_rset[id] -
2936 (uintptr_t)lpl_bootstrap) +
2937 (uintptr_t)target);
2938 }
2939 }
2940 }
2941
2942 /*
2943 * Topology information in lpl_bootstrap is no longer needed.
2944 */
2945 bzero(lpl_bootstrap_list, sizeof (lpl_bootstrap_list));
2946 }
2947
2948 /* the maximum effect that a single thread can have on its lgroup's load */
2949 #define LGRP_LOADAVG_MAX_EFFECT(ncpu) \
2950 ((lgrp_loadavg_max_effect) / (ncpu))
2951 uint32_t lgrp_loadavg_max_effect = LGRP_LOADAVG_THREAD_MAX;
2952
2953 /*
2954 * If the lowest load among the lgroups a process' threads are currently
2955 * spread across is greater than lgrp_expand_proc_thresh, we'll consider
2956 * expanding the process to a new lgroup.
2957 */
2958 #define LGRP_EXPAND_PROC_THRESH_DEFAULT 62250
2959 lgrp_load_t lgrp_expand_proc_thresh = LGRP_EXPAND_PROC_THRESH_DEFAULT;
2960
2961 #define LGRP_EXPAND_PROC_THRESH(ncpu) \
2962 ((lgrp_expand_proc_thresh) / (ncpu))
2963
2964 /*
2965 * A process will be expanded to a new lgroup only if the difference between
2966 * the lowest load on the lgroups the process' threads are currently spread
2967 * across and the lowest load on the other lgroups in the process' partition
2968 * is greater than lgrp_expand_proc_diff.
2969 */
2970 #define LGRP_EXPAND_PROC_DIFF_DEFAULT 60000
2971 lgrp_load_t lgrp_expand_proc_diff = LGRP_EXPAND_PROC_DIFF_DEFAULT;
2972
2973 #define LGRP_EXPAND_PROC_DIFF(ncpu) \
2974 ((lgrp_expand_proc_diff) / (ncpu))
2975
2976 /*
2977 * The loadavg tolerance accounts for "noise" inherent in the load, which may
2978 * be present due to impreciseness of the load average decay algorithm.
2979 *
2980 * The default tolerance is lgrp_loadavg_max_effect. Note that the tunable
2981 * tolerance is scaled by the number of cpus in the lgroup just like
2982 * lgrp_loadavg_max_effect. For example, if lgrp_loadavg_tolerance = 0x10000,
2983 * and ncpu = 4, then lgrp_choose will consider differences in lgroup loads
2984 * of: 0x10000 / 4 => 0x4000 or greater to be significant.
2985 */
2986 uint32_t lgrp_loadavg_tolerance = LGRP_LOADAVG_THREAD_MAX;
2987 #define LGRP_LOADAVG_TOLERANCE(ncpu) \
2988 ((lgrp_loadavg_tolerance) / ncpu)
2989
2990 /*
2991 * lgrp_choose() will choose the root lgroup as home when the lowest lgroup
2992 * load average is above this threshold
2993 */
2994 uint32_t lgrp_load_thresh = UINT32_MAX;
2995
2996 /*
2997 * lgrp_choose() will try to skip any lgroups with less free memory
2998 * than this when choosing a home lgroup
2999 */
3000 pgcnt_t lgrp_mem_free_thresh = 0;
3001
3002 /*
3003 * When choosing between similarly loaded lgroups, lgrp_choose() will pick
3004 * one based on one of the following policies:
3005 * - Random selection
3006 * - Pseudo round robin placement
3007 * - Longest time since a thread was last placed
3008 */
3009 #define LGRP_CHOOSE_RANDOM 1
3010 #define LGRP_CHOOSE_RR 2
3011 #define LGRP_CHOOSE_TIME 3
3012
3013 int lgrp_choose_policy = LGRP_CHOOSE_TIME;
3014
3015 /*
3016 * Choose a suitable leaf lgroup for a kthread.
The kthread is assumed not to 3017 * be bound to a CPU or processor set. 3018 * 3019 * Arguments: 3020 * t The thread 3021 * cpupart The partition the thread belongs to. 3022 * 3023 * NOTE: Should at least be called with the cpu_lock held, kernel preemption 3024 * disabled, or thread_lock held (at splhigh) to protect against the CPU 3025 * partitions changing out from under us and assumes that given thread is 3026 * protected. Also, called sometimes w/ cpus paused or kernel preemption 3027 * disabled, so don't grab any locks because we should never block under 3028 * those conditions. 3029 */ 3030 lpl_t * 3031 lgrp_choose(kthread_t *t, cpupart_t *cpupart) 3032 { 3033 lgrp_load_t bestload, bestrload; 3034 int lgrpid_offset, lgrp_count; 3035 lgrp_id_t lgrpid, lgrpid_start; 3036 lpl_t *lpl, *bestlpl, *bestrlpl; 3037 klgrpset_t lgrpset; 3038 proc_t *p; 3039 3040 ASSERT(t != NULL); 3041 ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 || 3042 THREAD_LOCK_HELD(t)); 3043 ASSERT(cpupart != NULL); 3044 3045 p = t->t_procp; 3046 3047 /* A process should always be in an active partition */ 3048 ASSERT(!klgrpset_isempty(cpupart->cp_lgrpset)); 3049 3050 bestlpl = bestrlpl = NULL; 3051 bestload = bestrload = LGRP_LOADAVG_MAX; 3052 lgrpset = cpupart->cp_lgrpset; 3053 3054 switch (lgrp_choose_policy) { 3055 case LGRP_CHOOSE_RR: 3056 lgrpid = cpupart->cp_lgrp_hint; 3057 do { 3058 if (++lgrpid > lgrp_alloc_max) 3059 lgrpid = 0; 3060 } while (!klgrpset_ismember(lgrpset, lgrpid)); 3061 3062 break; 3063 default: 3064 case LGRP_CHOOSE_TIME: 3065 case LGRP_CHOOSE_RANDOM: 3066 klgrpset_nlgrps(lgrpset, lgrp_count); 3067 lgrpid_offset = 3068 (((ushort_t)(gethrtime() >> 4)) % lgrp_count) + 1; 3069 for (lgrpid = 0; ; lgrpid++) { 3070 if (klgrpset_ismember(lgrpset, lgrpid)) { 3071 if (--lgrpid_offset == 0) 3072 break; 3073 } 3074 } 3075 break; 3076 } 3077 3078 lgrpid_start = lgrpid; 3079 3080 DTRACE_PROBE2(lgrp_choose_start, lgrp_id_t, lgrpid_start, 3081 lgrp_id_t, cpupart->cp_lgrp_hint); 3082 3083 /* 3084 * Use lgroup affinities (if any) to choose best lgroup 3085 * 3086 * NOTE: Assumes that thread is protected from going away and its 3087 * lgroup affinities won't change (ie. p_lock, or 3088 * thread_lock() being held and/or CPUs paused) 3089 */ 3090 if (t->t_lgrp_affinity) { 3091 lpl = lgrp_affinity_best(t, cpupart, lgrpid_start); 3092 if (lpl != NULL) 3093 return (lpl); 3094 } 3095 3096 ASSERT(klgrpset_ismember(lgrpset, lgrpid_start)); 3097 3098 do { 3099 pgcnt_t npgs; 3100 3101 /* 3102 * Skip any lgroups outside of thread's pset 3103 */ 3104 if (!klgrpset_ismember(lgrpset, lgrpid)) { 3105 if (++lgrpid > lgrp_alloc_max) 3106 lgrpid = 0; /* wrap the search */ 3107 continue; 3108 } 3109 3110 /* 3111 * Skip any non-leaf lgroups 3112 */ 3113 if (lgrp_table[lgrpid]->lgrp_childcnt != 0) 3114 continue; 3115 3116 /* 3117 * Skip any lgroups without enough free memory 3118 * (when threshold set to nonzero positive value) 3119 */ 3120 if (lgrp_mem_free_thresh > 0) { 3121 npgs = lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_FREE); 3122 if (npgs < lgrp_mem_free_thresh) { 3123 if (++lgrpid > lgrp_alloc_max) 3124 lgrpid = 0; /* wrap the search */ 3125 continue; 3126 } 3127 } 3128 3129 lpl = &cpupart->cp_lgrploads[lgrpid]; 3130 if (klgrpset_isempty(p->p_lgrpset) || 3131 klgrpset_ismember(p->p_lgrpset, lgrpid)) { 3132 /* 3133 * Either this is a new process or the process already 3134 * has threads on this lgrp, so this is a preferred 3135 * lgroup for the thread. 
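 * lpl_pick(), defined later in this file, prefers the less loaded
 * candidate, subject to the LGRP_LOADAVG_TOLERANCE() noise margin (and,
 * under LGRP_CHOOSE_TIME, to how recently a thread was last homed there).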
3136 */ 3137 if (bestlpl == NULL || 3138 lpl_pick(lpl, bestlpl)) { 3139 bestload = lpl->lpl_loadavg; 3140 bestlpl = lpl; 3141 } 3142 } else { 3143 /* 3144 * The process doesn't have any threads on this lgrp, 3145 * but we're willing to consider this lgrp if the load 3146 * difference is big enough to justify splitting up 3147 * the process' threads. 3148 */ 3149 if (bestrlpl == NULL || 3150 lpl_pick(lpl, bestrlpl)) { 3151 bestrload = lpl->lpl_loadavg; 3152 bestrlpl = lpl; 3153 } 3154 } 3155 if (++lgrpid > lgrp_alloc_max) 3156 lgrpid = 0; /* wrap the search */ 3157 } while (lgrpid != lgrpid_start); 3158 3159 /* 3160 * Return root lgroup if threshold isn't set to maximum value and 3161 * lowest lgroup load average more than a certain threshold 3162 */ 3163 if (lgrp_load_thresh != UINT32_MAX && 3164 bestload >= lgrp_load_thresh && bestrload >= lgrp_load_thresh) 3165 return (&cpupart->cp_lgrploads[lgrp_root->lgrp_id]); 3166 3167 /* 3168 * If all the lgroups over which the thread's process is spread are 3169 * heavily loaded, or otherwise undesirable, we'll consider placing 3170 * the thread on one of the other leaf lgroups in the thread's 3171 * partition. 3172 */ 3173 if ((bestlpl == NULL) || 3174 ((bestload > LGRP_EXPAND_PROC_THRESH(bestlpl->lpl_ncpu)) && 3175 (bestrload < bestload) && /* paranoid about wraparound */ 3176 (bestrload + LGRP_EXPAND_PROC_DIFF(bestrlpl->lpl_ncpu) < 3177 bestload))) { 3178 bestlpl = bestrlpl; 3179 } 3180 3181 if (bestlpl == NULL) { 3182 /* 3183 * No lgroup looked particularly good, but we still 3184 * have to pick something. Go with the randomly selected 3185 * legal lgroup we started with above. 3186 */ 3187 bestlpl = &cpupart->cp_lgrploads[lgrpid_start]; 3188 } 3189 3190 cpupart->cp_lgrp_hint = bestlpl->lpl_lgrpid; 3191 bestlpl->lpl_homed_time = gethrtime_unscaled(); 3192 3193 ASSERT(bestlpl->lpl_ncpu > 0); 3194 return (bestlpl); 3195 } 3196 3197 /* 3198 * Decide if lpl1 is a better candidate than lpl2 for lgrp homing. 3199 * Returns non-zero if lpl1 is a better candidate, and 0 otherwise. 3200 */ 3201 static int 3202 lpl_pick(lpl_t *lpl1, lpl_t *lpl2) 3203 { 3204 lgrp_load_t l1, l2; 3205 lgrp_load_t tolerance = LGRP_LOADAVG_TOLERANCE(lpl1->lpl_ncpu); 3206 3207 l1 = lpl1->lpl_loadavg; 3208 l2 = lpl2->lpl_loadavg; 3209 3210 if ((l1 + tolerance < l2) && (l1 < l2)) { 3211 /* lpl1 is significantly less loaded than lpl2 */ 3212 return (1); 3213 } 3214 3215 if (lgrp_choose_policy == LGRP_CHOOSE_TIME && 3216 l1 + tolerance >= l2 && l1 < l2 && 3217 lpl1->lpl_homed_time < lpl2->lpl_homed_time) { 3218 /* 3219 * lpl1's load is within the tolerance of lpl2. We're 3220 * willing to consider it be to better however if 3221 * it has been longer since we last homed a thread there 3222 */ 3223 return (1); 3224 } 3225 3226 return (0); 3227 } 3228 3229 /* 3230 * An LWP is expected to be assigned to an lgroup for at least this long 3231 * for its anticipatory load to be justified. NOTE that this value should 3232 * not be set extremely huge (say, larger than 100 years), to avoid problems 3233 * with overflow in the calculation that uses it. 3234 */ 3235 #define LGRP_MIN_NSEC (NANOSEC / 10) /* 1/10 of a second */ 3236 hrtime_t lgrp_min_nsec = LGRP_MIN_NSEC; 3237 3238 /* 3239 * Routine to change a thread's lgroup affiliation. This routine updates 3240 * the thread's kthread_t struct and its process' proc_t struct to note the 3241 * thread's new lgroup affiliation, and its lgroup affinities. 
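 *
 * In broad strokes, the routine: (1) drops the old lgroup from the process'
 * p_lgrpset when no other thread of the process remains homed there,
 * (2) backs out the anticipatory load added on the thread's behalf if the
 * thread was homed to the old lgroup only briefly, and (3) when newlpl is
 * not NULL, sets t_lpl, adds the new lgroup to p_lgrpset, and applies
 * anticipatory load up the new lgroup's lpl chain.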
3242 * 3243 * Note that this is the only routine that modifies a thread's t_lpl field, 3244 * and that adds in or removes anticipatory load. 3245 * 3246 * If the thread is exiting, newlpl is NULL. 3247 * 3248 * Locking: 3249 * The following lock must be held on entry: 3250 * cpu_lock, kpreempt_disable(), or thread_lock -- to assure t's new lgrp 3251 * doesn't get removed from t's partition 3252 * 3253 * This routine is not allowed to grab any locks, since it may be called 3254 * with cpus paused (such as from cpu_offline). 3255 */ 3256 void 3257 lgrp_move_thread(kthread_t *t, lpl_t *newlpl, int do_lgrpset_delete) 3258 { 3259 proc_t *p; 3260 lpl_t *lpl, *oldlpl; 3261 lgrp_id_t oldid; 3262 kthread_t *tp; 3263 uint_t ncpu; 3264 lgrp_load_t old, new; 3265 3266 ASSERT(t); 3267 ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 || 3268 THREAD_LOCK_HELD(t)); 3269 3270 /* 3271 * If not changing lpls, just return 3272 */ 3273 if ((oldlpl = t->t_lpl) == newlpl) 3274 return; 3275 3276 /* 3277 * Make sure the thread's lwp hasn't exited (if so, this thread is now 3278 * associated with process 0 rather than with its original process). 3279 */ 3280 if (t->t_proc_flag & TP_LWPEXIT) { 3281 if (newlpl != NULL) { 3282 t->t_lpl = newlpl; 3283 } 3284 return; 3285 } 3286 3287 p = ttoproc(t); 3288 3289 /* 3290 * If the thread had a previous lgroup, update its process' p_lgrpset 3291 * to account for it being moved from its old lgroup. 3292 */ 3293 if ((oldlpl != NULL) && /* thread had a previous lgroup */ 3294 (p->p_tlist != NULL)) { 3295 oldid = oldlpl->lpl_lgrpid; 3296 3297 if (newlpl != NULL) 3298 lgrp_stat_add(oldid, LGRP_NUM_MIGR, 1); 3299 3300 if ((do_lgrpset_delete) && 3301 (klgrpset_ismember(p->p_lgrpset, oldid))) { 3302 for (tp = p->p_tlist->t_forw; ; tp = tp->t_forw) { 3303 /* 3304 * Check if a thread other than the thread 3305 * that's moving is assigned to the same 3306 * lgroup as the thread that's moving. Note 3307 * that we have to compare lgroup IDs, rather 3308 * than simply comparing t_lpl's, since the 3309 * threads may belong to different partitions 3310 * but be assigned to the same lgroup. 3311 */ 3312 ASSERT(tp->t_lpl != NULL); 3313 3314 if ((tp != t) && 3315 (tp->t_lpl->lpl_lgrpid == oldid)) { 3316 /* 3317 * Another thread is assigned to the 3318 * same lgroup as the thread that's 3319 * moving, p_lgrpset doesn't change. 3320 */ 3321 break; 3322 } else if (tp == p->p_tlist) { 3323 /* 3324 * No other thread is assigned to the 3325 * same lgroup as the exiting thread, 3326 * clear the lgroup's bit in p_lgrpset. 3327 */ 3328 klgrpset_del(p->p_lgrpset, oldid); 3329 break; 3330 } 3331 } 3332 } 3333 3334 /* 3335 * If this thread was assigned to its old lgroup for such a 3336 * short amount of time that the anticipatory load that was 3337 * added on its behalf has aged very little, remove that 3338 * anticipatory load. 3339 */ 3340 if ((t->t_anttime + lgrp_min_nsec > gethrtime()) && 3341 ((ncpu = oldlpl->lpl_ncpu) > 0)) { 3342 lpl = oldlpl; 3343 for (;;) { 3344 do { 3345 old = new = lpl->lpl_loadavg; 3346 new -= LGRP_LOADAVG_MAX_EFFECT(ncpu); 3347 if (new > old) { 3348 /* 3349 * this can happen if the load 3350 * average was aged since we 3351 * added in the anticipatory 3352 * load 3353 */ 3354 new = 0; 3355 } 3356 } while (cas32( 3357 (lgrp_load_t *)&lpl->lpl_loadavg, old, 3358 new) != old); 3359 3360 lpl = lpl->lpl_parent; 3361 if (lpl == NULL) 3362 break; 3363 3364 ncpu = lpl->lpl_ncpu; 3365 ASSERT(ncpu > 0); 3366 } 3367 } 3368 } 3369 /* 3370 * If the thread has a new lgroup (i.e. 
it's not exiting), update its 3371 * t_lpl and its process' p_lgrpset, and apply an anticipatory load 3372 * to its new lgroup to account for its move to its new lgroup. 3373 */ 3374 if (newlpl != NULL) { 3375 /* 3376 * This thread is moving to a new lgroup 3377 */ 3378 t->t_lpl = newlpl; 3379 3380 /* 3381 * Reflect move in load average of new lgroup 3382 * unless it is root lgroup 3383 */ 3384 if (lgrp_table[newlpl->lpl_lgrpid] == lgrp_root) 3385 return; 3386 3387 if (!klgrpset_ismember(p->p_lgrpset, newlpl->lpl_lgrpid)) { 3388 klgrpset_add(p->p_lgrpset, newlpl->lpl_lgrpid); 3389 } 3390 3391 /* 3392 * It'll take some time for the load on the new lgroup 3393 * to reflect this thread's placement on it. We'd 3394 * like not, however, to have all threads between now 3395 * and then also piling on to this lgroup. To avoid 3396 * this pileup, we anticipate the load this thread 3397 * will generate on its new lgroup. The goal is to 3398 * make the lgroup's load appear as though the thread 3399 * had been there all along. We're very conservative 3400 * in calculating this anticipatory load, we assume 3401 * the worst case case (100% CPU-bound thread). This 3402 * may be modified in the future to be more accurate. 3403 */ 3404 lpl = newlpl; 3405 for (;;) { 3406 ncpu = lpl->lpl_ncpu; 3407 ASSERT(ncpu > 0); 3408 do { 3409 old = new = lpl->lpl_loadavg; 3410 new += LGRP_LOADAVG_MAX_EFFECT(ncpu); 3411 /* 3412 * Check for overflow 3413 * Underflow not possible here 3414 */ 3415 if (new < old) 3416 new = UINT32_MAX; 3417 } while (cas32((lgrp_load_t *)&lpl->lpl_loadavg, old, 3418 new) != old); 3419 3420 lpl = lpl->lpl_parent; 3421 if (lpl == NULL) 3422 break; 3423 } 3424 t->t_anttime = gethrtime(); 3425 } 3426 } 3427 3428 /* 3429 * Return lgroup memory allocation policy given advice from madvise(3C) 3430 */ 3431 lgrp_mem_policy_t 3432 lgrp_madv_to_policy(uchar_t advice, size_t size, int type) 3433 { 3434 switch (advice) { 3435 case MADV_ACCESS_LWP: 3436 return (LGRP_MEM_POLICY_NEXT); 3437 case MADV_ACCESS_MANY: 3438 return (LGRP_MEM_POLICY_RANDOM); 3439 default: 3440 return (lgrp_mem_policy_default(size, type)); 3441 } 3442 } 3443 3444 /* 3445 * Figure out default policy 3446 */ 3447 lgrp_mem_policy_t 3448 lgrp_mem_policy_default(size_t size, int type) 3449 { 3450 cpupart_t *cp; 3451 lgrp_mem_policy_t policy; 3452 size_t pset_mem_size; 3453 3454 /* 3455 * Randomly allocate memory across lgroups for shared memory 3456 * beyond a certain threshold 3457 */ 3458 if ((type != MAP_SHARED && size > lgrp_privm_random_thresh) || 3459 (type == MAP_SHARED && size > lgrp_shm_random_thresh)) { 3460 /* 3461 * Get total memory size of current thread's pset 3462 */ 3463 kpreempt_disable(); 3464 cp = curthread->t_cpupart; 3465 klgrpset_totalsize(cp->cp_lgrpset, pset_mem_size); 3466 kpreempt_enable(); 3467 3468 /* 3469 * Choose policy to randomly allocate memory across 3470 * lgroups in pset if it will fit and is not default 3471 * partition. Otherwise, allocate memory randomly 3472 * across machine. 3473 */ 3474 if (lgrp_mem_pset_aware && size < pset_mem_size) 3475 policy = LGRP_MEM_POLICY_RANDOM_PSET; 3476 else 3477 policy = LGRP_MEM_POLICY_RANDOM; 3478 } else 3479 /* 3480 * Apply default policy for private memory and 3481 * shared memory under the respective random 3482 * threshold. 
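 * That is, non-shared requests at or below lgrp_privm_random_thresh and
 * MAP_SHARED requests at or below lgrp_shm_random_thresh fall back to
 * lgrp_mem_default_policy.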
3483 */ 3484 policy = lgrp_mem_default_policy; 3485 3486 return (policy); 3487 } 3488 3489 /* 3490 * Get memory allocation policy for this segment 3491 */ 3492 lgrp_mem_policy_info_t * 3493 lgrp_mem_policy_get(struct seg *seg, caddr_t vaddr) 3494 { 3495 lgrp_mem_policy_info_t *policy_info; 3496 extern struct seg_ops segspt_ops; 3497 extern struct seg_ops segspt_shmops; 3498 3499 /* 3500 * This is for binary compatibility to protect against third party 3501 * segment drivers which haven't recompiled to allow for 3502 * SEGOP_GETPOLICY() 3503 */ 3504 if (seg->s_ops != &segvn_ops && seg->s_ops != &segspt_ops && 3505 seg->s_ops != &segspt_shmops) 3506 return (NULL); 3507 3508 policy_info = NULL; 3509 if (seg->s_ops->getpolicy != NULL) 3510 policy_info = SEGOP_GETPOLICY(seg, vaddr); 3511 3512 return (policy_info); 3513 } 3514 3515 /* 3516 * Set policy for allocating private memory given desired policy, policy info, 3517 * size in bytes of memory that policy is being applied. 3518 * Return 0 if policy wasn't set already and 1 if policy was set already 3519 */ 3520 int 3521 lgrp_privm_policy_set(lgrp_mem_policy_t policy, 3522 lgrp_mem_policy_info_t *policy_info, size_t size) 3523 { 3524 3525 ASSERT(policy_info != NULL); 3526 3527 if (policy == LGRP_MEM_POLICY_DEFAULT) 3528 policy = lgrp_mem_policy_default(size, MAP_PRIVATE); 3529 3530 /* 3531 * Policy set already? 3532 */ 3533 if (policy == policy_info->mem_policy) 3534 return (1); 3535 3536 /* 3537 * Set policy 3538 */ 3539 policy_info->mem_policy = policy; 3540 policy_info->mem_reserved = 0; 3541 3542 return (0); 3543 } 3544 3545 3546 /* 3547 * Get shared memory allocation policy with given tree and offset 3548 */ 3549 lgrp_mem_policy_info_t * 3550 lgrp_shm_policy_get(struct anon_map *amp, ulong_t anon_index, vnode_t *vp, 3551 u_offset_t vn_off) 3552 { 3553 u_offset_t off; 3554 lgrp_mem_policy_info_t *policy_info; 3555 lgrp_shm_policy_seg_t *policy_seg; 3556 lgrp_shm_locality_t *shm_locality; 3557 avl_tree_t *tree; 3558 avl_index_t where; 3559 3560 /* 3561 * Get policy segment tree from anon_map or vnode and use specified 3562 * anon index or vnode offset as offset 3563 * 3564 * Assume that no lock needs to be held on anon_map or vnode, since 3565 * they should be protected by their reference count which must be 3566 * nonzero for an existing segment 3567 */ 3568 if (amp) { 3569 ASSERT(amp->refcnt != 0); 3570 shm_locality = amp->locality; 3571 if (shm_locality == NULL) 3572 return (NULL); 3573 tree = shm_locality->loc_tree; 3574 off = ptob(anon_index); 3575 } else if (vp) { 3576 shm_locality = vp->v_locality; 3577 if (shm_locality == NULL) 3578 return (NULL); 3579 ASSERT(shm_locality->loc_count != 0); 3580 tree = shm_locality->loc_tree; 3581 off = vn_off; 3582 } 3583 3584 if (tree == NULL) 3585 return (NULL); 3586 3587 /* 3588 * Lookup policy segment for offset into shared object and return 3589 * policy info 3590 */ 3591 rw_enter(&shm_locality->loc_lock, RW_READER); 3592 policy_info = NULL; 3593 policy_seg = avl_find(tree, &off, &where); 3594 if (policy_seg) 3595 policy_info = &policy_seg->shm_policy; 3596 rw_exit(&shm_locality->loc_lock); 3597 3598 return (policy_info); 3599 } 3600 3601 /* 3602 * Return lgroup to use for allocating memory 3603 * given the segment and address 3604 * 3605 * There isn't any mutual exclusion that exists between calls 3606 * to this routine and DR, so this routine and whomever calls it 3607 * should be mindful of the possibility that the lgrp returned 3608 * may be deleted. 
If this happens, dereferences of the lgrp 3609 * pointer will still be safe, but the resources in the lgrp will 3610 * be gone, and LGRP_EXISTS() will no longer be true. 3611 */ 3612 lgrp_t * 3613 lgrp_mem_choose(struct seg *seg, caddr_t vaddr, size_t pgsz) 3614 { 3615 int i; 3616 lgrp_t *lgrp; 3617 klgrpset_t lgrpset; 3618 int lgrps_spanned; 3619 unsigned long off; 3620 lgrp_mem_policy_t policy; 3621 lgrp_mem_policy_info_t *policy_info; 3622 ushort_t random; 3623 int stat = 0; 3624 3625 /* 3626 * Just return null if the lgrp framework hasn't finished 3627 * initializing or if this is a UMA machine. 3628 */ 3629 if (nlgrps == 1 || !lgrp_initialized) 3630 return (lgrp_root); 3631 3632 /* 3633 * Get memory allocation policy for this segment 3634 */ 3635 policy = lgrp_mem_default_policy; 3636 if (seg != NULL) { 3637 if (seg->s_as == &kas) { 3638 if (policy == LGRP_MEM_POLICY_RANDOM_PROC || 3639 policy == LGRP_MEM_POLICY_RANDOM_PSET) 3640 policy = LGRP_MEM_POLICY_RANDOM; 3641 } else { 3642 policy_info = lgrp_mem_policy_get(seg, vaddr); 3643 if (policy_info != NULL) 3644 policy = policy_info->mem_policy; 3645 } 3646 } 3647 lgrpset = 0; 3648 3649 /* 3650 * Initialize lgroup to home by default 3651 */ 3652 lgrp = lgrp_home_lgrp(); 3653 3654 /* 3655 * When homing threads on root lgrp, override default memory 3656 * allocation policies with root lgroup memory allocation policy 3657 */ 3658 if (lgrp == lgrp_root) 3659 policy = lgrp_mem_policy_root; 3660 3661 /* 3662 * Implement policy 3663 */ 3664 switch (policy) { 3665 case LGRP_MEM_POLICY_NEXT_CPU: 3666 3667 /* 3668 * Return lgroup of current CPU which faulted on memory 3669 * If the CPU isn't currently in an lgrp, then opt to 3670 * allocate from the root. 3671 * 3672 * Kernel preemption needs to be disabled here to prevent 3673 * the current CPU from going away before lgrp is found. 3674 */ 3675 if (LGRP_CPU_HAS_NO_LGRP(CPU)) { 3676 lgrp = lgrp_root; 3677 } else { 3678 kpreempt_disable(); 3679 lgrp = lgrp_cpu_to_lgrp(CPU); 3680 kpreempt_enable(); 3681 } 3682 break; 3683 3684 case LGRP_MEM_POLICY_NEXT: 3685 case LGRP_MEM_POLICY_DEFAULT: 3686 default: 3687 3688 /* 3689 * Just return current thread's home lgroup 3690 * for default policy (next touch) 3691 * If the thread is homed to the root, 3692 * then the default policy is random across lgroups. 3693 * Fallthrough to the random case. 3694 */ 3695 if (lgrp != lgrp_root) { 3696 if (policy == LGRP_MEM_POLICY_NEXT) 3697 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_NEXT, 1); 3698 else 3699 lgrp_stat_add(lgrp->lgrp_id, 3700 LGRP_NUM_DEFAULT, 1); 3701 break; 3702 } 3703 /* LINTED fallthrough on case statement */ 3704 case LGRP_MEM_POLICY_RANDOM: 3705 3706 /* 3707 * Return a random leaf lgroup with memory 3708 */ 3709 lgrpset = lgrp_root->lgrp_set[LGRP_RSRC_MEM]; 3710 /* 3711 * Count how many lgroups are spanned 3712 */ 3713 klgrpset_nlgrps(lgrpset, lgrps_spanned); 3714 3715 /* 3716 * There may be no memnodes in the root lgroup during DR copy 3717 * rename on a system with only two boards (memnodes) 3718 * configured. In this case just return the root lgrp. 
3719 */ 3720 if (lgrps_spanned == 0) { 3721 lgrp = lgrp_root; 3722 break; 3723 } 3724 3725 /* 3726 * Pick a random offset within lgroups spanned 3727 * and return lgroup at that offset 3728 */ 3729 random = (ushort_t)gethrtime() >> 4; 3730 off = random % lgrps_spanned; 3731 ASSERT(off <= lgrp_alloc_max); 3732 3733 for (i = 0; i <= lgrp_alloc_max; i++) { 3734 if (!klgrpset_ismember(lgrpset, i)) 3735 continue; 3736 if (off) 3737 off--; 3738 else { 3739 lgrp = lgrp_table[i]; 3740 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_RANDOM, 3741 1); 3742 break; 3743 } 3744 } 3745 break; 3746 3747 case LGRP_MEM_POLICY_RANDOM_PROC: 3748 3749 /* 3750 * Grab copy of bitmask of lgroups spanned by 3751 * this process 3752 */ 3753 klgrpset_copy(lgrpset, curproc->p_lgrpset); 3754 stat = LGRP_NUM_RANDOM_PROC; 3755 3756 /* LINTED fallthrough on case statement */ 3757 case LGRP_MEM_POLICY_RANDOM_PSET: 3758 3759 if (!stat) 3760 stat = LGRP_NUM_RANDOM_PSET; 3761 3762 if (klgrpset_isempty(lgrpset)) { 3763 /* 3764 * Grab copy of bitmask of lgroups spanned by 3765 * this processor set 3766 */ 3767 kpreempt_disable(); 3768 klgrpset_copy(lgrpset, 3769 curthread->t_cpupart->cp_lgrpset); 3770 kpreempt_enable(); 3771 } 3772 3773 /* 3774 * Count how many lgroups are spanned 3775 */ 3776 klgrpset_nlgrps(lgrpset, lgrps_spanned); 3777 ASSERT(lgrps_spanned <= nlgrps); 3778 3779 /* 3780 * Probably lgrps_spanned should be always non-zero, but to be 3781 * on the safe side we return lgrp_root if it is empty. 3782 */ 3783 if (lgrps_spanned == 0) { 3784 lgrp = lgrp_root; 3785 break; 3786 } 3787 3788 /* 3789 * Pick a random offset within lgroups spanned 3790 * and return lgroup at that offset 3791 */ 3792 random = (ushort_t)gethrtime() >> 4; 3793 off = random % lgrps_spanned; 3794 ASSERT(off <= lgrp_alloc_max); 3795 3796 for (i = 0; i <= lgrp_alloc_max; i++) { 3797 if (!klgrpset_ismember(lgrpset, i)) 3798 continue; 3799 if (off) 3800 off--; 3801 else { 3802 lgrp = lgrp_table[i]; 3803 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_RANDOM, 3804 1); 3805 break; 3806 } 3807 } 3808 break; 3809 3810 case LGRP_MEM_POLICY_ROUNDROBIN: 3811 3812 /* 3813 * Use offset within segment to determine 3814 * offset from home lgroup to choose for 3815 * next lgroup to allocate memory from 3816 */ 3817 off = ((unsigned long)(vaddr - seg->s_base) / pgsz) % 3818 (lgrp_alloc_max + 1); 3819 3820 kpreempt_disable(); 3821 lgrpset = lgrp_root->lgrp_set[LGRP_RSRC_MEM]; 3822 i = lgrp->lgrp_id; 3823 kpreempt_enable(); 3824 3825 while (off > 0) { 3826 i = (i + 1) % (lgrp_alloc_max + 1); 3827 lgrp = lgrp_table[i]; 3828 if (klgrpset_ismember(lgrpset, i)) 3829 off--; 3830 } 3831 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ROUNDROBIN, 1); 3832 3833 break; 3834 } 3835 3836 ASSERT(lgrp != NULL); 3837 return (lgrp); 3838 } 3839 3840 /* 3841 * Return the number of pages in an lgroup 3842 * 3843 * NOTE: NUMA test (numat) driver uses this, so changing arguments or semantics 3844 * could cause tests that rely on the numat driver to fail.... 
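 *
 * For example, the lgroup kstat code earlier in this file uses the
 * LGRP_MEM_SIZE_INSTALL, LGRP_MEM_SIZE_AVAIL and LGRP_MEM_SIZE_FREE
 * queries to export per-lgroup page counts.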
3845 */ 3846 pgcnt_t 3847 lgrp_mem_size(lgrp_id_t lgrpid, lgrp_mem_query_t query) 3848 { 3849 lgrp_t *lgrp; 3850 3851 lgrp = lgrp_table[lgrpid]; 3852 if (!LGRP_EXISTS(lgrp) || 3853 klgrpset_isempty(lgrp->lgrp_set[LGRP_RSRC_MEM]) || 3854 !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid)) 3855 return (0); 3856 3857 return (lgrp_plat_mem_size(lgrp->lgrp_plathand, query)); 3858 } 3859 3860 /* 3861 * Initialize lgroup shared memory allocation policy support 3862 */ 3863 void 3864 lgrp_shm_policy_init(struct anon_map *amp, vnode_t *vp) 3865 { 3866 lgrp_shm_locality_t *shm_locality; 3867 3868 /* 3869 * Initialize locality field in anon_map 3870 * Don't need any locks because this is called when anon_map is 3871 * allocated, but not used anywhere yet. 3872 */ 3873 if (amp) { 3874 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 3875 if (amp->locality == NULL) { 3876 /* 3877 * Allocate and initialize shared memory locality info 3878 * and set anon_map locality pointer to it 3879 * Drop lock across kmem_alloc(KM_SLEEP) 3880 */ 3881 ANON_LOCK_EXIT(&->a_rwlock); 3882 shm_locality = kmem_alloc(sizeof (*shm_locality), 3883 KM_SLEEP); 3884 rw_init(&shm_locality->loc_lock, NULL, RW_DEFAULT, 3885 NULL); 3886 shm_locality->loc_count = 1; /* not used for amp */ 3887 shm_locality->loc_tree = NULL; 3888 3889 /* 3890 * Reacquire lock and check to see whether anyone beat 3891 * us to initializing the locality info 3892 */ 3893 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 3894 if (amp->locality != NULL) { 3895 rw_destroy(&shm_locality->loc_lock); 3896 kmem_free(shm_locality, 3897 sizeof (*shm_locality)); 3898 } else 3899 amp->locality = shm_locality; 3900 } 3901 ANON_LOCK_EXIT(&->a_rwlock); 3902 return; 3903 } 3904 3905 /* 3906 * Allocate shared vnode policy info if vnode is not locality aware yet 3907 */ 3908 mutex_enter(&vp->v_lock); 3909 if ((vp->v_flag & V_LOCALITY) == 0) { 3910 /* 3911 * Allocate and initialize shared memory locality info 3912 */ 3913 mutex_exit(&vp->v_lock); 3914 shm_locality = kmem_alloc(sizeof (*shm_locality), KM_SLEEP); 3915 rw_init(&shm_locality->loc_lock, NULL, RW_DEFAULT, NULL); 3916 shm_locality->loc_count = 1; 3917 shm_locality->loc_tree = NULL; 3918 3919 /* 3920 * Point vnode locality field at shared vnode policy info 3921 * and set locality aware flag in vnode 3922 */ 3923 mutex_enter(&vp->v_lock); 3924 if ((vp->v_flag & V_LOCALITY) == 0) { 3925 vp->v_locality = shm_locality; 3926 vp->v_flag |= V_LOCALITY; 3927 } else { 3928 /* 3929 * Lost race so free locality info and increment count. 
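 * (Another thread set v_locality while v_lock was dropped for the
 * allocation; discard our copy and take a reference on the existing
 * locality info instead.)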

/*
 * Initialize lgroup shared memory allocation policy support
 */
void
lgrp_shm_policy_init(struct anon_map *amp, vnode_t *vp)
{
	lgrp_shm_locality_t	*shm_locality;

	/*
	 * Initialize locality field in anon_map
	 * Don't need any locks because this is called when anon_map is
	 * allocated, but not used anywhere yet.
	 */
	if (amp) {
		ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
		if (amp->locality == NULL) {
			/*
			 * Allocate and initialize shared memory locality info
			 * and set anon_map locality pointer to it
			 * Drop lock across kmem_alloc(KM_SLEEP)
			 */
			ANON_LOCK_EXIT(&amp->a_rwlock);
			shm_locality = kmem_alloc(sizeof (*shm_locality),
			    KM_SLEEP);
			rw_init(&shm_locality->loc_lock, NULL, RW_DEFAULT,
			    NULL);
			shm_locality->loc_count = 1;	/* not used for amp */
			shm_locality->loc_tree = NULL;

			/*
			 * Reacquire lock and check to see whether anyone beat
			 * us to initializing the locality info
			 */
			ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
			if (amp->locality != NULL) {
				rw_destroy(&shm_locality->loc_lock);
				kmem_free(shm_locality,
				    sizeof (*shm_locality));
			} else
				amp->locality = shm_locality;
		}
		ANON_LOCK_EXIT(&amp->a_rwlock);
		return;
	}

	/*
	 * Allocate shared vnode policy info if vnode is not locality aware yet
	 */
	mutex_enter(&vp->v_lock);
	if ((vp->v_flag & V_LOCALITY) == 0) {
		/*
		 * Allocate and initialize shared memory locality info
		 */
		mutex_exit(&vp->v_lock);
		shm_locality = kmem_alloc(sizeof (*shm_locality), KM_SLEEP);
		rw_init(&shm_locality->loc_lock, NULL, RW_DEFAULT, NULL);
		shm_locality->loc_count = 1;
		shm_locality->loc_tree = NULL;

		/*
		 * Point vnode locality field at shared vnode policy info
		 * and set locality aware flag in vnode
		 */
		mutex_enter(&vp->v_lock);
		if ((vp->v_flag & V_LOCALITY) == 0) {
			vp->v_locality = shm_locality;
			vp->v_flag |= V_LOCALITY;
		} else {
			/*
			 * Lost race so free locality info and increment count.
			 */
			rw_destroy(&shm_locality->loc_lock);
			kmem_free(shm_locality, sizeof (*shm_locality));
			shm_locality = vp->v_locality;
			shm_locality->loc_count++;
		}
		mutex_exit(&vp->v_lock);

		return;
	}

	/*
	 * Increment reference count of number of segments mapping this vnode
	 * shared
	 */
	shm_locality = vp->v_locality;
	shm_locality->loc_count++;
	mutex_exit(&vp->v_lock);
}

/*
 * Destroy the given shared memory policy segment tree
 */
void
lgrp_shm_policy_tree_destroy(avl_tree_t *tree)
{
	lgrp_shm_policy_seg_t	*cur;
	lgrp_shm_policy_seg_t	*next;

	if (tree == NULL)
		return;

	cur = (lgrp_shm_policy_seg_t *)avl_first(tree);
	while (cur != NULL) {
		next = AVL_NEXT(tree, cur);
		avl_remove(tree, cur);
		kmem_free(cur, sizeof (*cur));
		cur = next;
	}
	kmem_free(tree, sizeof (avl_tree_t));
}

/*
 * Uninitialize lgroup shared memory allocation policy support
 */
void
lgrp_shm_policy_fini(struct anon_map *amp, vnode_t *vp)
{
	lgrp_shm_locality_t	*shm_locality;

	/*
	 * For anon_map, deallocate shared memory policy tree and
	 * zero locality field
	 * Don't need any locks because anon_map is being freed
	 */
	if (amp) {
		if (amp->locality == NULL)
			return;
		shm_locality = amp->locality;
		shm_locality->loc_count = 0;	/* not really used for amp */
		rw_destroy(&shm_locality->loc_lock);
		lgrp_shm_policy_tree_destroy(shm_locality->loc_tree);
		kmem_free(shm_locality, sizeof (*shm_locality));
		amp->locality = 0;
		return;
	}

	/*
	 * For vnode, decrement reference count of segments mapping this vnode
	 * shared and delete locality info if reference count drops to 0
	 */
	mutex_enter(&vp->v_lock);
	shm_locality = vp->v_locality;
	shm_locality->loc_count--;

	if (shm_locality->loc_count == 0) {
		rw_destroy(&shm_locality->loc_lock);
		lgrp_shm_policy_tree_destroy(shm_locality->loc_tree);
		kmem_free(shm_locality, sizeof (*shm_locality));
		vp->v_locality = 0;
		vp->v_flag &= ~V_LOCALITY;
	}
	mutex_exit(&vp->v_lock);
}

/*
 * Compare two shared memory policy segments
 * Used by AVL tree code for searching
 */
int
lgrp_shm_policy_compar(const void *x, const void *y)
{
	lgrp_shm_policy_seg_t *a = (lgrp_shm_policy_seg_t *)x;
	lgrp_shm_policy_seg_t *b = (lgrp_shm_policy_seg_t *)y;

	if (a->shm_off < b->shm_off)
		return (-1);
	if (a->shm_off >= b->shm_off + b->shm_size)
		return (1);
	return (0);
}

/*
 * Concatenate seg1 with seg2 and remove seg2
 */
static int
lgrp_shm_policy_concat(avl_tree_t *tree, lgrp_shm_policy_seg_t *seg1,
    lgrp_shm_policy_seg_t *seg2)
{
	if (!seg1 || !seg2 ||
	    seg1->shm_off + seg1->shm_size != seg2->shm_off ||
	    seg1->shm_policy.mem_policy != seg2->shm_policy.mem_policy)
		return (-1);

	seg1->shm_size += seg2->shm_size;
	avl_remove(tree, seg2);
	kmem_free(seg2, sizeof (*seg2));
	return (0);
}
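
/*
 * Illustrative sketch (not part of the original code): because
 * lgrp_shm_policy_compar() above treats any offset falling inside a segment's
 * [shm_off, shm_off + shm_size) range as "equal", an avl_find() whose key
 * carries the offset of interest in shm_off returns the segment covering that
 * offset, if any.  The code in lgrp_shm_policy_set() below passes &off
 * directly as the key, which appears to rely on shm_off being the structure's
 * first member; the sketch builds an explicit key instead.  Hypothetical
 * helper, kept under #if 0 so it is not compiled.
 */
#if 0
static lgrp_shm_policy_seg_t *
lgrp_shm_policy_seg_lookup(avl_tree_t *tree, u_offset_t off)
{
	lgrp_shm_policy_seg_t	key;
	avl_index_t		where;

	key.shm_off = off;
	key.shm_size = 0;	/* not examined by the comparator for the key */

	/* Returns NULL (and an insertion point) if no segment covers off */
	return (avl_find(tree, &key, &where));
}
#endif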

/*
 * Split segment at given offset and return rightmost (uppermost) segment
 * Assumes that there are no overlapping segments
 */
static lgrp_shm_policy_seg_t *
lgrp_shm_policy_split(avl_tree_t *tree, lgrp_shm_policy_seg_t *seg,
    u_offset_t off)
{
	lgrp_shm_policy_seg_t	*newseg;
	avl_index_t		where;

	ASSERT(seg != NULL);
	ASSERT(off >= seg->shm_off && off <= seg->shm_off + seg->shm_size);

	if (!seg || off < seg->shm_off || off > seg->shm_off +
	    seg->shm_size)
		return (NULL);

	if (off == seg->shm_off || off == seg->shm_off + seg->shm_size)
		return (seg);

	/*
	 * Adjust size of left segment and allocate new (right) segment
	 */
	newseg = kmem_alloc(sizeof (lgrp_shm_policy_seg_t), KM_SLEEP);
	newseg->shm_policy = seg->shm_policy;
	newseg->shm_off = off;
	newseg->shm_size = seg->shm_size - (off - seg->shm_off);
	seg->shm_size = off - seg->shm_off;

	/*
	 * Find where to insert new segment in AVL tree and insert it
	 */
	(void) avl_find(tree, &off, &where);
	avl_insert(tree, newseg, where);

	return (newseg);
}
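
/*
 * Illustrative sketch (not part of the original code): splitting a segment
 * covering pages [0, 8) at page 3 leaves the original segment describing
 * [0, 3) and returns a newly allocated segment describing [3, 8); both carry
 * the original policy, so the split can be undone with a concat.  The
 * function name is hypothetical and the block is kept under #if 0 so it is
 * not compiled.
 */
#if 0
static void
lgrp_shm_policy_split_example(void)
{
	avl_tree_t		tree;
	lgrp_shm_policy_seg_t	*seg, *right;

	avl_create(&tree, lgrp_shm_policy_compar,
	    sizeof (lgrp_shm_policy_seg_t),
	    offsetof(lgrp_shm_policy_seg_t, shm_tree));

	/* One segment spanning the first 8 pages with a RANDOM policy */
	seg = kmem_zalloc(sizeof (*seg), KM_SLEEP);
	seg->shm_off = 0;
	seg->shm_size = ptob(8);
	seg->shm_policy.mem_policy = LGRP_MEM_POLICY_RANDOM;
	avl_add(&tree, seg);

	right = lgrp_shm_policy_split(&tree, seg, ptob(3));
	ASSERT(seg->shm_size == ptob(3));
	ASSERT(right->shm_off == ptob(3) && right->shm_size == ptob(5));

	/* Same policy and adjacent offsets, so concatenation succeeds */
	(void) lgrp_shm_policy_concat(&tree, seg, right);
	ASSERT(seg->shm_size == ptob(8));

	avl_remove(&tree, seg);
	kmem_free(seg, sizeof (*seg));
	avl_destroy(&tree);
}
#endif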

/*
 * Set shared memory allocation policy on specified shared object at given
 * offset and length
 *
 * Return 0 if policy wasn't set already, 1 if policy was set already, and
 * -1 if the policy can't be set.
 */
int
lgrp_shm_policy_set(lgrp_mem_policy_t policy, struct anon_map *amp,
    ulong_t anon_index, vnode_t *vp, u_offset_t vn_off, size_t len)
{
	u_offset_t		eoff;
	lgrp_shm_policy_seg_t	*next;
	lgrp_shm_policy_seg_t	*newseg;
	u_offset_t		off;
	u_offset_t		oldeoff;
	lgrp_shm_policy_seg_t	*prev;
	int			retval;
	lgrp_shm_policy_seg_t	*seg;
	lgrp_shm_locality_t	*shm_locality;
	avl_tree_t		*tree;
	avl_index_t		where;

	ASSERT(amp || vp);
	ASSERT((len & PAGEOFFSET) == 0);

	if (len == 0)
		return (-1);

	retval = 0;

	/*
	 * Get locality info and starting offset into shared object
	 * Try anon map first and then vnode
	 * Assume that no locks need to be held on anon_map or vnode, since
	 * it should be protected by its reference count which must be nonzero
	 * for an existing segment.
	 */
	if (amp) {
		/*
		 * Get policy info from anon_map
		 */
		ASSERT(amp->refcnt != 0);
		if (amp->locality == NULL)
			lgrp_shm_policy_init(amp, NULL);
		shm_locality = amp->locality;
		off = ptob(anon_index);
	} else if (vp) {
		/*
		 * Get policy info from vnode
		 */
		if ((vp->v_flag & V_LOCALITY) == 0 || vp->v_locality == NULL)
			lgrp_shm_policy_init(NULL, vp);
		shm_locality = vp->v_locality;
		ASSERT(shm_locality->loc_count != 0);
		off = vn_off;
	} else
		return (-1);

	ASSERT((off & PAGEOFFSET) == 0);

	/*
	 * Figure out default policy
	 */
	if (policy == LGRP_MEM_POLICY_DEFAULT)
		policy = lgrp_mem_policy_default(len, MAP_SHARED);

	/*
	 * Create AVL tree if there isn't one yet
	 * and set locality field to point at it
	 */
	rw_enter(&shm_locality->loc_lock, RW_WRITER);
	tree = shm_locality->loc_tree;
	if (!tree) {
		rw_exit(&shm_locality->loc_lock);

		tree = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);

		rw_enter(&shm_locality->loc_lock, RW_WRITER);
		if (shm_locality->loc_tree == NULL) {
			avl_create(tree, lgrp_shm_policy_compar,
			    sizeof (lgrp_shm_policy_seg_t),
			    offsetof(lgrp_shm_policy_seg_t, shm_tree));
			shm_locality->loc_tree = tree;
		} else {
			/*
			 * Another thread managed to set up the tree
			 * before we could. Free the tree we allocated
			 * and use the one that's already there.
			 */
			kmem_free(tree, sizeof (*tree));
			tree = shm_locality->loc_tree;
		}
	}

	/*
	 * Set policy
	 *
	 * Need to maintain hold on writer's lock to keep tree from
	 * changing out from under us
	 */
	while (len != 0) {
		/*
		 * Find policy segment for specified offset into shared object
		 */
		seg = avl_find(tree, &off, &where);

		/*
		 * Didn't find any existing segment that contains specified
		 * offset, so allocate new segment, insert it, and concatenate
		 * with adjacent segments if possible
		 */
		if (seg == NULL) {
			newseg = kmem_alloc(sizeof (lgrp_shm_policy_seg_t),
			    KM_SLEEP);
			newseg->shm_policy.mem_policy = policy;
			newseg->shm_policy.mem_reserved = 0;
			newseg->shm_off = off;
			avl_insert(tree, newseg, where);

			/*
			 * Check to see whether new segment overlaps with next
			 * one, set length of new segment accordingly, and
			 * calculate remaining length and next offset
			 */
			seg = AVL_NEXT(tree, newseg);
			if (seg == NULL || off + len <= seg->shm_off) {
				newseg->shm_size = len;
				len = 0;
			} else {
				newseg->shm_size = seg->shm_off - off;
				off = seg->shm_off;
				len -= newseg->shm_size;
			}

			/*
			 * Try to concatenate new segment with next and
			 * previous ones, since they might have the same policy
			 * now. Grab previous and next segments first because
			 * they will change on concatenation.
			 */
			prev = AVL_PREV(tree, newseg);
			next = AVL_NEXT(tree, newseg);
			(void) lgrp_shm_policy_concat(tree, newseg, next);
			(void) lgrp_shm_policy_concat(tree, prev, newseg);

			continue;
		}

		eoff = off + len;
		oldeoff = seg->shm_off + seg->shm_size;

		/*
		 * Policy set already?
		 */
		if (policy == seg->shm_policy.mem_policy) {
			/*
			 * Nothing left to do if offset and length
			 * fall within this segment
			 */
			if (eoff <= oldeoff) {
				retval = 1;
				break;
			} else {
				len = eoff - oldeoff;
				off = oldeoff;
				continue;
			}
		}

		/*
		 * Specified offset and length match existing segment exactly
		 */
		if (off == seg->shm_off && len == seg->shm_size) {
			/*
			 * Set policy and update current length
			 */
			seg->shm_policy.mem_policy = policy;
			seg->shm_policy.mem_reserved = 0;
			len = 0;

			/*
			 * Try concatenating new segment with previous and next
			 * segments, since they might have the same policy now.
			 * Grab previous and next segments first because they
			 * will change on concatenation.
			 */
			prev = AVL_PREV(tree, seg);
			next = AVL_NEXT(tree, seg);
			(void) lgrp_shm_policy_concat(tree, seg, next);
			(void) lgrp_shm_policy_concat(tree, prev, seg);
		} else {
			/*
			 * Specified offset and length only apply to part of
			 * existing segment
			 */

			/*
			 * New segment starts in middle of old one, so split
			 * new one off near beginning of old one
			 */
			newseg = NULL;
			if (off > seg->shm_off) {
				newseg = lgrp_shm_policy_split(tree, seg, off);

				/*
				 * New segment ends where old one did, so try
				 * to concatenate with next segment
				 */
				if (eoff == oldeoff) {
					newseg->shm_policy.mem_policy = policy;
					newseg->shm_policy.mem_reserved = 0;
					(void) lgrp_shm_policy_concat(tree,
					    newseg, AVL_NEXT(tree, newseg));
					break;
				}
			}

			/*
			 * New segment ends before old one, so split off end of
			 * old one
			 */
			if (eoff < oldeoff) {
				if (newseg) {
					(void) lgrp_shm_policy_split(tree,
					    newseg, eoff);
					newseg->shm_policy.mem_policy = policy;
					newseg->shm_policy.mem_reserved = 0;
				} else {
					(void) lgrp_shm_policy_split(tree, seg,
					    eoff);
					seg->shm_policy.mem_policy = policy;
					seg->shm_policy.mem_reserved = 0;
				}

				if (off == seg->shm_off)
					(void) lgrp_shm_policy_concat(tree,
					    AVL_PREV(tree, seg), seg);
				break;
			}

			/*
			 * Calculate remaining length and next offset
			 */
			len = eoff - oldeoff;
			off = oldeoff;
		}
	}

	rw_exit(&shm_locality->loc_lock);
	return (retval);
}
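
/*
 * Illustrative sketch (not part of the original code): applying a round-robin
 * placement policy to the first 16 pages of a shared anonymous object.  The
 * caller identifies the object by its anon_map (vp == NULL) and must pass a
 * page-aligned length, per the ASSERTs above; the anon_map is assumed to have
 * a nonzero reference count.  The function name is hypothetical and the block
 * is kept under #if 0 so it is not compiled.
 */
#if 0
static void
lgrp_shm_policy_set_example(struct anon_map *amp)
{
	int	ret;

	ret = lgrp_shm_policy_set(LGRP_MEM_POLICY_ROUNDROBIN, amp,
	    0, NULL, 0, ptob(16));

	/*
	 * Per the comment above: 0 means the policy was not already set,
	 * 1 means it was already set, and -1 means it could not be set
	 * (e.g. zero length).
	 */
	ASSERT(ret == 0 || ret == 1);
}
#endif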

/*
 * Return the best memnode from which to allocate memory given
 * an lgroup.
 *
 * "c" is for cookie, which is good enough for me.
 * It references a cookie struct that should be zero'ed to initialize.
 * The cookie should live on the caller's stack.
 *
 * The routine returns -1 when:
 *	- traverse is 0, and all the memnodes in "lgrp" have been returned.
 *	- traverse is 1, and all the memnodes in the system have been
 *	  returned.
 */
int
lgrp_memnode_choose(lgrp_mnode_cookie_t *c)
{
	lgrp_t		*lp = c->lmc_lgrp;
	mnodeset_t	nodes = c->lmc_nodes;
	int		cnt = c->lmc_cnt;
	int		offset, mnode;

	extern int	max_mem_nodes;

	/*
	 * If the set is empty, and the caller is willing, traverse
	 * up the hierarchy until we find a non-empty set.
	 */
	while (nodes == (mnodeset_t)0 || cnt <= 0) {
		if (c->lmc_scope == LGRP_SRCH_LOCAL ||
		    ((lp = lp->lgrp_parent) == NULL))
			return (-1);

		nodes = lp->lgrp_mnodes & ~(c->lmc_tried);
		cnt = lp->lgrp_nmnodes - c->lmc_ntried;
	}

	/*
	 * Select a memnode by picking one at a "random" offset.
	 * Because of DR, memnodes can come and go at any time.
	 * This code must be able to cope with the possibility
	 * that the nodes count "cnt" is inconsistent with respect
	 * to the number of elements actually in "nodes", and
	 * therefore that the offset chosen could be greater than
	 * the number of elements in the set (some memnodes may
	 * have disappeared just before cnt was read).
	 * If this happens, the search simply wraps back to the
	 * beginning of the set.
	 */
	ASSERT(nodes != (mnodeset_t)0 && cnt > 0);
	offset = c->lmc_rand % cnt;
	do {
		for (mnode = 0; mnode < max_mem_nodes; mnode++)
			if (nodes & ((mnodeset_t)1 << mnode))
				if (!offset--)
					break;
	} while (mnode >= max_mem_nodes);

	/* Found a node. Store state before returning. */
	c->lmc_lgrp = lp;
	c->lmc_nodes = (nodes & ~((mnodeset_t)1 << mnode));
	c->lmc_cnt = cnt - 1;
	c->lmc_tried = (c->lmc_tried | ((mnodeset_t)1 << mnode));
	c->lmc_ntried++;

	return (mnode);
}
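
/*
 * Illustrative sketch (not part of the original code): walking every memnode
 * that backs a given lgroup, nearest first, using the cookie interface above.
 * LGRP_MNODE_COOKIE_INIT() and the LGRP_SRCH_HIER scope are assumed to come
 * from <sys/lgrp.h>; the function name is hypothetical and the block is kept
 * under #if 0 so it is not compiled.
 */
#if 0
static void
lgrp_memnode_walk_example(lgrp_t *lgrp)
{
	lgrp_mnode_cookie_t	c;
	int			mnode;

	/*
	 * Zero the stack cookie and seed it with the starting lgroup.
	 * LGRP_SRCH_HIER lets lgrp_memnode_choose() climb toward the root
	 * once the starting lgroup's memnodes are exhausted; LGRP_SRCH_LOCAL
	 * would stop at the starting lgroup instead.
	 */
	LGRP_MNODE_COOKIE_INIT(c, lgrp, LGRP_SRCH_HIER);

	while ((mnode = lgrp_memnode_choose(&c)) != -1) {
		/* try to allocate (or account for) pages from "mnode" here */
	}
}
#endif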