1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License, Version 1.0 only 6 * (the "License"). You may not use this file except in compliance 7 * with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright 2005 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 /* 30 * Basic NUMA support in terms of locality groups 31 * 32 * Solaris needs to know which CPUs, memory, etc. are near each other to 33 * provide good performance on NUMA machines by optimizing for locality. 34 * In order to do this, a new abstraction called a "locality group (lgroup)" 35 * has been introduced to keep track of which CPU-like and memory-like hardware 36 * resources are close to each other. Currently, latency is the only measure 37 * used to determine how to group hardware resources into lgroups, but this 38 * does not limit the groupings to be based solely on latency. Other factors 39 * may be used to determine the groupings in the future. 40 * 41 * Lgroups are organized into a hieararchy or topology that represents the 42 * latency topology of the machine. There is always at least a root lgroup in 43 * the system. It represents all the hardware resources in the machine at a 44 * latency big enough that any hardware resource can at least access any other 45 * hardware resource within that latency. A Uniform Memory Access (UMA) 46 * machine is represented with one lgroup (the root). In contrast, a NUMA 47 * machine is represented at least by the root lgroup and some number of leaf 48 * lgroups where the leaf lgroups contain the hardware resources within the 49 * least latency of each other and the root lgroup still contains all the 50 * resources in the machine. Some number of intermediate lgroups may exist 51 * which represent more levels of locality than just the local latency of the 52 * leaf lgroups and the system latency of the root lgroup. Non-leaf lgroups 53 * (eg. root and intermediate lgroups) contain the next nearest resources to 54 * its children lgroups. Thus, the lgroup hierarchy from a given leaf lgroup 55 * to the root lgroup shows the hardware resources from closest to farthest 56 * from the leaf lgroup such that each successive ancestor lgroup contains 57 * the next nearest resources at the next level of locality from the previous. 58 * 59 * The kernel uses the lgroup abstraction to know how to allocate resources 60 * near a given process/thread. At fork() and lwp/thread_create() time, a 61 * "home" lgroup is chosen for a thread. This is done by picking the lgroup 62 * with the lowest load average. Binding to a processor or processor set will 63 * change the home lgroup for a thread. The scheduler has been modified to try 64 * to dispatch a thread on a CPU in its home lgroup. 
Physical memory 65 * allocation is lgroup aware too, so memory will be allocated from the current 66 * thread's home lgroup if possible. If the desired resources are not 67 * available, the kernel traverses the lgroup hierarchy going to the parent 68 * lgroup to find resources at the next level of locality until it reaches the 69 * root lgroup. 70 */ 71 72 #include <sys/lgrp.h> 73 #include <sys/lgrp_user.h> 74 #include <sys/types.h> 75 #include <sys/mman.h> 76 #include <sys/param.h> 77 #include <sys/var.h> 78 #include <sys/thread.h> 79 #include <sys/cpuvar.h> 80 #include <sys/cpupart.h> 81 #include <sys/kmem.h> 82 #include <vm/seg.h> 83 #include <vm/seg_kmem.h> 84 #include <vm/seg_spt.h> 85 #include <vm/seg_vn.h> 86 #include <vm/as.h> 87 #include <sys/atomic.h> 88 #include <sys/systm.h> 89 #include <sys/errno.h> 90 #include <sys/cmn_err.h> 91 #include <sys/kstat.h> 92 #include <sys/sysmacros.h> 93 #include <sys/chip.h> 94 #include <sys/promif.h> 95 #include <sys/sdt.h> 96 97 lgrp_gen_t lgrp_gen = 0; /* generation of lgroup hierarchy */ 98 lgrp_t *lgrp_table[NLGRPS_MAX]; /* table of all initialized lgrp_t structs */ 99 /* indexed by lgrp_id */ 100 int nlgrps; /* number of lgroups in machine */ 101 int lgrp_alloc_hint = -1; /* hint for where to try to allocate next */ 102 int lgrp_alloc_max = 0; /* max lgroup ID allocated so far */ 103 104 /* 105 * Kstat data for lgroups. 106 * 107 * Actual kstat data is collected in lgrp_stats array. 108 * The lgrp_kstat_data array of named kstats is used to extract data from 109 * lgrp_stats and present it to kstat framework. It is protected from partallel 110 * modifications by lgrp_kstat_mutex. This may cause some contention when 111 * several kstat commands run in parallel but this is not the 112 * performance-critical path. 113 */ 114 extern struct lgrp_stats lgrp_stats[]; /* table of per-lgrp stats */ 115 116 /* 117 * Declare kstat names statically for enums as defined in the header file. 118 */ 119 LGRP_KSTAT_NAMES; 120 121 static void lgrp_kstat_init(void); 122 static int lgrp_kstat_extract(kstat_t *, int); 123 static void lgrp_kstat_reset(lgrp_id_t); 124 125 static struct kstat_named lgrp_kstat_data[LGRP_NUM_STATS]; 126 static kmutex_t lgrp_kstat_mutex; 127 128 129 /* 130 * max number of lgroups supported by the platform 131 */ 132 int nlgrpsmax = 0; 133 134 /* 135 * The root lgroup. Represents the set of resources at the system wide 136 * level of locality. 137 */ 138 lgrp_t *lgrp_root = NULL; 139 140 /* 141 * During system bootstrap cp_default does not contain the list of lgrp load 142 * averages (cp_lgrploads). The list is allocated after the first CPU is brought 143 * on-line when cp_default is initialized by cpupart_initialize_default(). 144 * Configuring CPU0 may create a two-level topology with root and one leaf node 145 * containing CPU0. This topology is initially constructed in a special 146 * statically allocated 2-element lpl list lpl_bootstrap_list and later cloned 147 * to cp_default when cp_default is initialized. The lpl_bootstrap_list is used 148 * for all lpl operations until cp_default is fully constructed. 149 * 150 * The lpl_bootstrap_list is maintained by the code in lgrp.c. Every other 151 * consumer who needs default lpl should use lpl_bootstrap which is a pointer to 152 * the first element of lpl_bootstrap_list. 153 * 154 * CPUs that are added to the system, but have not yet been assigned to an 155 * lgrp will use lpl_bootstrap as a default lpl. 
This is necessary because 156 * on some architectures (x86) it's possible for the slave CPU startup thread 157 * to enter the dispatcher or allocate memory before calling lgrp_cpu_init(). 158 */ 159 #define LPL_BOOTSTRAP_SIZE 2 160 static lpl_t lpl_bootstrap_list[LPL_BOOTSTRAP_SIZE]; 161 lpl_t *lpl_bootstrap; 162 163 /* 164 * If cp still references the bootstrap lpl, it has not yet been added to 165 * an lgrp. lgrp_mem_choose() uses this macro to detect the case where 166 * a thread is trying to allocate memory close to a CPU that has no lgrp. 167 */ 168 #define LGRP_CPU_HAS_NO_LGRP(cp) ((cp)->cpu_lpl == lpl_bootstrap) 169 170 static lgrp_t lroot; 171 172 173 /* 174 * Size, in bytes, beyond which random memory allocation policy is applied 175 * to non-shared memory. Default is the maximum size, so random memory 176 * allocation won't be used for non-shared memory by default. 177 */ 178 size_t lgrp_privm_random_thresh = (size_t)(-1); 179 180 /* 181 * Size, in bytes, beyond which random memory allocation policy is applied to 182 * shared memory. Default is 8MB (2 ISM pages). 183 */ 184 size_t lgrp_shm_random_thresh = 8*1024*1024; 185 186 /* 187 * Whether to do processor set aware memory allocation by default 188 */ 189 int lgrp_mem_pset_aware = 0; 190 191 /* 192 * Set the default memory allocation policy for root lgroup 193 */ 194 lgrp_mem_policy_t lgrp_mem_policy_root = LGRP_MEM_POLICY_RANDOM; 195 196 /* 197 * Set the default memory allocation policy. For most platforms, 198 * next touch is sufficient, but some platforms may wish to override 199 * this. 200 */ 201 lgrp_mem_policy_t lgrp_mem_default_policy = LGRP_MEM_POLICY_NEXT; 202 203 204 /* 205 * lgroup CPU event handlers 206 */ 207 static void lgrp_cpu_init(struct cpu *); 208 static void lgrp_cpu_fini(struct cpu *, lgrp_id_t); 209 static lgrp_t *lgrp_cpu_to_lgrp(struct cpu *); 210 211 static void lgrp_latency_change(u_longlong_t, u_longlong_t); 212 213 /* 214 * lgroup memory event handlers 215 */ 216 static void lgrp_mem_init(int, lgrp_handle_t, boolean_t); 217 static void lgrp_mem_fini(int, lgrp_handle_t, boolean_t); 218 static void lgrp_mem_rename(int, lgrp_handle_t, lgrp_handle_t); 219 220 /* 221 * lgroup CPU partition event handlers 222 */ 223 static void lgrp_part_add_cpu(struct cpu *, lgrp_id_t); 224 static void lgrp_part_del_cpu(struct cpu *); 225 226 static void lgrp_root_init(void); 227 228 /* 229 * lpl topology 230 */ 231 static void lpl_init(lpl_t *, lpl_t *, lgrp_t *); 232 static void lpl_clear(lpl_t *); 233 static void lpl_leaf_insert(lpl_t *, struct cpupart *); 234 static void lpl_leaf_remove(lpl_t *, struct cpupart *); 235 static void lpl_rset_add(lpl_t *, lpl_t *); 236 static void lpl_rset_del(lpl_t *, lpl_t *); 237 static int lpl_rset_contains(lpl_t *, lpl_t *); 238 static void lpl_cpu_adjcnt(lpl_act_t, struct cpu *); 239 static void lpl_child_update(lpl_t *, struct cpupart *); 240 static int lpl_pick(lpl_t *, lpl_t *); 241 static void lpl_verify_wrapper(struct cpupart *); 242 243 /* 244 * defines for lpl topology verifier return codes 245 */ 246 247 #define LPL_TOPO_CORRECT 0 248 #define LPL_TOPO_PART_HAS_NO_LPL -1 249 #define LPL_TOPO_CPUS_NOT_EMPTY -2 250 #define LPL_TOPO_LGRP_MISMATCH -3 251 #define LPL_TOPO_MISSING_PARENT -4 252 #define LPL_TOPO_PARENT_MISMATCH -5 253 #define LPL_TOPO_BAD_CPUCNT -6 254 #define LPL_TOPO_RSET_MISMATCH -7 255 #define LPL_TOPO_LPL_ORPHANED -8 256 #define LPL_TOPO_LPL_BAD_NCPU -9 257 #define LPL_TOPO_RSET_MSSNG_LF -10 258 #define LPL_TOPO_CPU_HAS_BAD_LPL -11 259 #define 
LPL_TOPO_BOGUS_HINT -12 260 #define LPL_TOPO_NONLEAF_HAS_CPUS -13 261 #define LPL_TOPO_LGRP_NOT_LEAF -14 262 #define LPL_TOPO_BAD_RSETCNT -15 263 264 /* 265 * Return whether lgroup optimizations should be enabled on this system 266 */ 267 int 268 lgrp_optimizations(void) 269 { 270 /* 271 * System must have more than 2 lgroups to enable lgroup optimizations 272 * 273 * XXX This assumes that a 2 lgroup system has an empty root lgroup 274 * with one child lgroup containing all the resources. A 2 lgroup 275 * system with a root lgroup directly containing CPUs or memory might 276 * need lgroup optimizations with its child lgroup, but there 277 * isn't such a machine for now.... 278 */ 279 if (nlgrps > 2) 280 return (1); 281 282 return (0); 283 } 284 285 /* 286 * Build full lgroup topology 287 */ 288 static void 289 lgrp_root_init(void) 290 { 291 lgrp_handle_t hand; 292 int i; 293 lgrp_id_t id; 294 295 /* 296 * Create the "root" lgroup 297 */ 298 ASSERT(nlgrps == 0); 299 id = nlgrps++; 300 301 lgrp_root = &lroot; 302 303 lgrp_root->lgrp_cpu = NULL; 304 lgrp_root->lgrp_mnodes = 0; 305 lgrp_root->lgrp_nmnodes = 0; 306 hand = lgrp_plat_root_hand(); 307 lgrp_root->lgrp_plathand = hand; 308 309 lgrp_root->lgrp_id = id; 310 lgrp_root->lgrp_cpucnt = 0; 311 lgrp_root->lgrp_childcnt = 0; 312 klgrpset_clear(lgrp_root->lgrp_children); 313 klgrpset_clear(lgrp_root->lgrp_leaves); 314 lgrp_root->lgrp_parent = NULL; 315 lgrp_root->lgrp_chips = NULL; 316 lgrp_root->lgrp_chipcnt = 0; 317 lgrp_root->lgrp_latency = lgrp_plat_latency(hand, hand); 318 319 for (i = 0; i < LGRP_RSRC_COUNT; i++) 320 klgrpset_clear(lgrp_root->lgrp_set[i]); 321 322 lgrp_root->lgrp_kstat = NULL; 323 324 lgrp_table[id] = lgrp_root; 325 326 /* 327 * Setup initial lpl list for CPU0 and initial t0 home. 328 * The only lpl space we have so far is lpl_bootstrap. It is used for 329 * all topology operations until cp_default is initialized at which 330 * point t0.t_lpl will be updated. 331 */ 332 lpl_bootstrap = lpl_bootstrap_list; 333 t0.t_lpl = lpl_bootstrap; 334 cp_default.cp_nlgrploads = LPL_BOOTSTRAP_SIZE; 335 lpl_bootstrap_list[1].lpl_lgrpid = 1; 336 cp_default.cp_lgrploads = lpl_bootstrap; 337 } 338 339 /* 340 * Initialize the lgroup framework and allow the platform to do the same 341 */ 342 void 343 lgrp_init(void) 344 { 345 /* 346 * Initialize the platform 347 */ 348 lgrp_plat_init(); 349 350 /* 351 * Set max number of lgroups supported on this platform which must be 352 * less than the max number of lgroups supported by the common lgroup 353 * framework (eg. NLGRPS_MAX is max elements in lgrp_table[], etc.) 354 */ 355 nlgrpsmax = lgrp_plat_max_lgrps(); 356 ASSERT(nlgrpsmax <= NLGRPS_MAX); 357 } 358 359 /* 360 * Create the root and cpu0's lgroup, and set t0's home. 361 */ 362 void 363 lgrp_setup(void) 364 { 365 /* 366 * Setup the root lgroup 367 */ 368 lgrp_root_init(); 369 370 /* 371 * Add cpu0 to an lgroup 372 */ 373 lgrp_config(LGRP_CONFIG_CPU_ADD, (uintptr_t)CPU, 0); 374 lgrp_config(LGRP_CONFIG_CPU_ONLINE, (uintptr_t)CPU, 0); 375 } 376 377 /* 378 * Lgroup initialization is split in two parts. The first part 379 * (lgrp_main_init()) is called right before start_other_cpus() in main. The 380 * second part (lgrp_main_mp_init()) is called right after start_other_cpus() 381 * when all CPUs are brought online and all distance information is available. 382 * 383 * When lgrp_main_init() is complete it sets lgrp_initialized. The 384 * lgrp_main_mp_init() sets lgrp_topo_initialized. 
385 */ 386 387 /* 388 * true when lgrp initialization has been completed. 389 */ 390 int lgrp_initialized = 0; 391 392 /* 393 * True when lgrp topology is constructed. 394 */ 395 int lgrp_topo_initialized = 0; 396 397 /* 398 * Init routine called after startup(), /etc/system has been processed, 399 * and cpu0 has been added to an lgroup. 400 */ 401 void 402 lgrp_main_init(void) 403 { 404 cpu_t *cp = CPU; 405 lgrp_id_t lgrpid; 406 int i; 407 /* 408 * Enforce a valid lgrp_mem_default_policy 409 */ 410 if ((lgrp_mem_default_policy <= LGRP_MEM_POLICY_DEFAULT) || 411 (lgrp_mem_default_policy >= LGRP_NUM_MEM_POLICIES)) 412 lgrp_mem_default_policy = LGRP_MEM_POLICY_NEXT; 413 414 /* 415 * See if mpo should be disabled. 416 * This may happen in the case of null proc LPA on Starcat. 417 * The platform won't be able to detect null proc LPA until after 418 * cpu0 and memory have already been added to lgroups. 419 * When and if it is detected, the Starcat platform will return 420 * a different platform handle for cpu0 which is what we check for 421 * here. If mpo should be disabled move cpu0 to it's rightful place 422 * (the root), and destroy the remaining lgroups. This effectively 423 * provides an UMA lgroup topology. 424 */ 425 lgrpid = cp->cpu_lpl->lpl_lgrpid; 426 if (lgrp_table[lgrpid]->lgrp_plathand != 427 lgrp_plat_cpu_to_hand(cp->cpu_id)) { 428 lgrp_part_del_cpu(cp); 429 lgrp_cpu_fini(cp, lgrpid); 430 431 lgrp_cpu_init(cp); 432 lgrp_part_add_cpu(cp, cp->cpu_lpl->lpl_lgrpid); 433 434 ASSERT(cp->cpu_lpl->lpl_lgrpid == LGRP_ROOTID); 435 436 /* 437 * Destroy all lgroups except for root 438 */ 439 for (i = 0; i <= lgrp_alloc_max; i++) { 440 if (LGRP_EXISTS(lgrp_table[i]) && 441 lgrp_table[i] != lgrp_root) 442 lgrp_destroy(lgrp_table[i]); 443 } 444 445 /* 446 * Fix up root to point at itself for leaves and resources 447 * and not have any children 448 */ 449 lgrp_root->lgrp_childcnt = 0; 450 klgrpset_clear(lgrp_root->lgrp_children); 451 klgrpset_clear(lgrp_root->lgrp_leaves); 452 klgrpset_add(lgrp_root->lgrp_leaves, LGRP_ROOTID); 453 klgrpset_clear(lgrp_root->lgrp_set[LGRP_RSRC_MEM]); 454 klgrpset_add(lgrp_root->lgrp_set[LGRP_RSRC_MEM], LGRP_ROOTID); 455 } 456 457 /* 458 * Initialize kstats framework. 459 */ 460 lgrp_kstat_init(); 461 /* 462 * cpu0 is finally where it should be, so create it's lgroup's kstats 463 */ 464 mutex_enter(&cpu_lock); 465 lgrp_kstat_create(cp); 466 mutex_exit(&cpu_lock); 467 468 lgrp_plat_main_init(); 469 lgrp_initialized = 1; 470 } 471 472 /* 473 * Finish lgrp initialization after all CPUS are brought on-line. 474 * This routine is called after start_other_cpus(). 475 */ 476 void 477 lgrp_main_mp_init(void) 478 { 479 klgrpset_t changed; 480 481 /* 482 * Update lgroup topology (if necessary) 483 */ 484 klgrpset_clear(changed); 485 (void) lgrp_topo_update(lgrp_table, lgrp_alloc_max + 1, &changed); 486 lgrp_topo_initialized = 1; 487 } 488 489 /* 490 * Handle lgroup (re)configuration events (eg. addition of CPU, etc.) 491 */ 492 void 493 lgrp_config(lgrp_config_flag_t event, uintptr_t resource, uintptr_t where) 494 { 495 klgrpset_t changed; 496 cpu_t *cp; 497 lgrp_id_t id; 498 int rc; 499 500 switch (event) { 501 /* 502 * The following (re)configuration events are common code 503 * initiated. lgrp_plat_config() is called here to inform the 504 * platform of the reconfiguration event. 
505 */ 506 case LGRP_CONFIG_CPU_ADD: 507 cp = (cpu_t *)resource; 508 509 /* 510 * Initialize the new CPU's lgrp related next/prev 511 * links, and give it a bootstrap lpl so that it can 512 * survive should it need to enter the dispatcher. 513 */ 514 cp->cpu_next_lpl = cp; 515 cp->cpu_prev_lpl = cp; 516 cp->cpu_next_lgrp = cp; 517 cp->cpu_prev_lgrp = cp; 518 cp->cpu_lpl = lpl_bootstrap; 519 520 lgrp_plat_config(event, resource); 521 atomic_add_32(&lgrp_gen, 1); 522 523 break; 524 case LGRP_CONFIG_CPU_DEL: 525 lgrp_plat_config(event, resource); 526 atomic_add_32(&lgrp_gen, 1); 527 528 break; 529 case LGRP_CONFIG_CPU_ONLINE: 530 cp = (cpu_t *)resource; 531 lgrp_cpu_init(cp); 532 lgrp_part_add_cpu(cp, cp->cpu_lpl->lpl_lgrpid); 533 rc = lpl_topo_verify(cp->cpu_part); 534 if (rc != LPL_TOPO_CORRECT) { 535 panic("lpl_topo_verify failed: %d", rc); 536 } 537 lgrp_plat_config(event, resource); 538 atomic_add_32(&lgrp_gen, 1); 539 540 break; 541 case LGRP_CONFIG_CPU_OFFLINE: 542 cp = (cpu_t *)resource; 543 id = cp->cpu_lpl->lpl_lgrpid; 544 lgrp_part_del_cpu(cp); 545 lgrp_cpu_fini(cp, id); 546 rc = lpl_topo_verify(cp->cpu_part); 547 if (rc != LPL_TOPO_CORRECT) { 548 panic("lpl_topo_verify failed: %d", rc); 549 } 550 lgrp_plat_config(event, resource); 551 atomic_add_32(&lgrp_gen, 1); 552 553 break; 554 case LGRP_CONFIG_CPUPART_ADD: 555 cp = (cpu_t *)resource; 556 lgrp_part_add_cpu((cpu_t *)resource, (lgrp_id_t)where); 557 rc = lpl_topo_verify(cp->cpu_part); 558 if (rc != LPL_TOPO_CORRECT) { 559 panic("lpl_topo_verify failed: %d", rc); 560 } 561 lgrp_plat_config(event, resource); 562 563 break; 564 case LGRP_CONFIG_CPUPART_DEL: 565 cp = (cpu_t *)resource; 566 lgrp_part_del_cpu((cpu_t *)resource); 567 rc = lpl_topo_verify(cp->cpu_part); 568 if (rc != LPL_TOPO_CORRECT) { 569 panic("lpl_topo_verify failed: %d", rc); 570 } 571 lgrp_plat_config(event, resource); 572 573 break; 574 /* 575 * The following events are initiated by the memnode 576 * subsystem. 577 */ 578 case LGRP_CONFIG_MEM_ADD: 579 lgrp_mem_init((int)resource, where, B_FALSE); 580 atomic_add_32(&lgrp_gen, 1); 581 582 break; 583 case LGRP_CONFIG_MEM_DEL: 584 lgrp_mem_fini((int)resource, where, B_FALSE); 585 atomic_add_32(&lgrp_gen, 1); 586 587 break; 588 case LGRP_CONFIG_MEM_RENAME: { 589 lgrp_config_mem_rename_t *ren_arg = 590 (lgrp_config_mem_rename_t *)where; 591 592 lgrp_mem_rename((int)resource, 593 ren_arg->lmem_rename_from, 594 ren_arg->lmem_rename_to); 595 atomic_add_32(&lgrp_gen, 1); 596 597 break; 598 } 599 case LGRP_CONFIG_GEN_UPDATE: 600 atomic_add_32(&lgrp_gen, 1); 601 602 break; 603 case LGRP_CONFIG_FLATTEN: 604 if (where == 0) 605 lgrp_topo_levels = (int)resource; 606 else 607 (void) lgrp_topo_flatten(resource, 608 lgrp_table, lgrp_alloc_max, &changed); 609 610 break; 611 /* 612 * Initiated by platform latency probing code 613 */ 614 case LGRP_CONFIG_LATENCY_CHANGE: 615 lgrp_latency_change((u_longlong_t)resource, 616 (u_longlong_t)where); 617 618 break; 619 case LGRP_CONFIG_NOP: 620 621 break; 622 default: 623 break; 624 } 625 626 } 627 628 /* 629 * Called to add lgrp info into cpu structure from cpu_add_unit; 630 * do not assume cpu is in cpu[] yet! 631 * 632 * CPUs are brought online with all other CPUs paused so we can't 633 * allocate memory or we could deadlock the system, so we rely on 634 * the platform to statically allocate as much space as we need 635 * for the lgrp structs and stats. 
636 */ 637 static void 638 lgrp_cpu_init(struct cpu *cp) 639 { 640 klgrpset_t changed; 641 int count; 642 lgrp_handle_t hand; 643 int first_cpu; 644 lgrp_t *my_lgrp; 645 lgrp_id_t lgrpid; 646 struct cpu *cptr; 647 struct chip *chp; 648 649 /* 650 * This is the first time through if the resource set 651 * for the root lgroup is empty. After cpu0 has been 652 * initially added to an lgroup, the root's CPU resource 653 * set can never be empty, since the system's last CPU 654 * cannot be offlined. 655 */ 656 if (klgrpset_isempty(lgrp_root->lgrp_set[LGRP_RSRC_CPU])) { 657 /* 658 * First time through. 659 */ 660 first_cpu = 1; 661 } else { 662 /* 663 * If cpu0 needs to move lgroups, we may come 664 * through here again, at which time cpu_lock won't 665 * be held, and lgrp_initialized will be false. 666 */ 667 ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized); 668 ASSERT(cp->cpu_part != NULL); 669 first_cpu = 0; 670 } 671 672 hand = lgrp_plat_cpu_to_hand(cp->cpu_id); 673 my_lgrp = lgrp_hand_to_lgrp(hand); 674 675 if (my_lgrp == NULL) { 676 /* 677 * Create new lgrp and add it to lgroup topology 678 */ 679 my_lgrp = lgrp_create(); 680 my_lgrp->lgrp_plathand = hand; 681 my_lgrp->lgrp_latency = lgrp_plat_latency(hand, hand); 682 lgrpid = my_lgrp->lgrp_id; 683 klgrpset_add(my_lgrp->lgrp_leaves, lgrpid); 684 klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid); 685 686 count = 0; 687 klgrpset_clear(changed); 688 count += lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1, 689 &changed); 690 /* 691 * May have added new intermediate lgroups, so need to add 692 * resources other than CPUs which are added below 693 */ 694 (void) lgrp_mnode_update(changed, NULL); 695 } else if (my_lgrp->lgrp_latency == 0 && lgrp_plat_latency(hand, hand) 696 > 0) { 697 /* 698 * Leaf lgroup was created, but latency wasn't available 699 * then. So, set latency for it and fill in rest of lgroup 700 * topology now that we know how far it is from other leaf 701 * lgroups. 702 */ 703 lgrpid = my_lgrp->lgrp_id; 704 klgrpset_clear(changed); 705 if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_CPU], 706 lgrpid)) 707 klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid); 708 count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1, 709 &changed); 710 711 /* 712 * May have added new intermediate lgroups, so need to add 713 * resources other than CPUs which are added below 714 */ 715 (void) lgrp_mnode_update(changed, NULL); 716 } else if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_CPU], 717 my_lgrp->lgrp_id)) { 718 int i; 719 720 /* 721 * Update existing lgroup and lgroups containing it with CPU 722 * resource 723 */ 724 lgrpid = my_lgrp->lgrp_id; 725 klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid); 726 for (i = 0; i <= lgrp_alloc_max; i++) { 727 lgrp_t *lgrp; 728 729 lgrp = lgrp_table[i]; 730 if (!LGRP_EXISTS(lgrp) || 731 !lgrp_rsets_member(lgrp->lgrp_set, lgrpid)) 732 continue; 733 734 klgrpset_add(lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid); 735 } 736 } 737 738 lgrpid = my_lgrp->lgrp_id; 739 cp->cpu_lpl = &cp->cpu_part->cp_lgrploads[lgrpid]; 740 741 /* 742 * For multi-lgroup systems, need to setup lpl for CPU0 or CPU0 will 743 * end up in lpl for lgroup 0 whether it is supposed to be in there or 744 * not since none of lgroup IDs in the lpl's have been set yet. 
745 */ 746 if (first_cpu && nlgrpsmax > 1 && lgrpid != cp->cpu_lpl->lpl_lgrpid) 747 cp->cpu_lpl->lpl_lgrpid = lgrpid; 748 749 /* 750 * link the CPU into the lgrp's CPU list 751 */ 752 if (my_lgrp->lgrp_cpucnt == 0) { 753 my_lgrp->lgrp_cpu = cp; 754 cp->cpu_next_lgrp = cp->cpu_prev_lgrp = cp; 755 } else { 756 cptr = my_lgrp->lgrp_cpu; 757 cp->cpu_next_lgrp = cptr; 758 cp->cpu_prev_lgrp = cptr->cpu_prev_lgrp; 759 cptr->cpu_prev_lgrp->cpu_next_lgrp = cp; 760 cptr->cpu_prev_lgrp = cp; 761 } 762 my_lgrp->lgrp_cpucnt++; 763 764 /* 765 * Add this cpu's chip to the per lgroup list 766 * if necessary 767 */ 768 if (cp->cpu_chip->chip_lgrp == NULL) { 769 struct chip *lcpr; 770 771 chp = cp->cpu_chip; 772 773 if (my_lgrp->lgrp_chipcnt == 0) { 774 my_lgrp->lgrp_chips = chp; 775 chp->chip_next_lgrp = 776 chp->chip_prev_lgrp = chp; 777 } else { 778 lcpr = my_lgrp->lgrp_chips; 779 chp->chip_next_lgrp = lcpr; 780 chp->chip_prev_lgrp = 781 lcpr->chip_prev_lgrp; 782 lcpr->chip_prev_lgrp->chip_next_lgrp = 783 chp; 784 lcpr->chip_prev_lgrp = chp; 785 } 786 chp->chip_lgrp = my_lgrp; 787 chp->chip_balance = chp->chip_next_lgrp; 788 my_lgrp->lgrp_chipcnt++; 789 } 790 } 791 792 lgrp_t * 793 lgrp_create(void) 794 { 795 lgrp_t *my_lgrp; 796 lgrp_id_t lgrpid; 797 int i; 798 799 ASSERT(!lgrp_initialized || MUTEX_HELD(&cpu_lock)); 800 801 /* 802 * Find an open slot in the lgroup table and recycle unused lgroup 803 * left there if any 804 */ 805 my_lgrp = NULL; 806 if (lgrp_alloc_hint == -1) 807 /* 808 * Allocate from end when hint not set yet because no lgroups 809 * have been deleted yet 810 */ 811 lgrpid = nlgrps++; 812 else { 813 /* 814 * Start looking for next open slot from hint and leave hint 815 * at slot allocated 816 */ 817 for (i = lgrp_alloc_hint; i < nlgrpsmax; i++) { 818 my_lgrp = lgrp_table[i]; 819 if (!LGRP_EXISTS(my_lgrp)) { 820 lgrpid = i; 821 nlgrps++; 822 break; 823 } 824 } 825 lgrp_alloc_hint = lgrpid; 826 } 827 828 /* 829 * Keep track of max lgroup ID allocated so far to cut down on searches 830 */ 831 if (lgrpid > lgrp_alloc_max) 832 lgrp_alloc_max = lgrpid; 833 834 /* 835 * Need to allocate new lgroup if next open slot didn't have one 836 * for recycling 837 */ 838 if (my_lgrp == NULL) 839 my_lgrp = lgrp_plat_alloc(lgrpid); 840 841 if (nlgrps > nlgrpsmax || my_lgrp == NULL) 842 panic("Too many lgrps for platform (%d)", nlgrps); 843 844 my_lgrp->lgrp_id = lgrpid; 845 my_lgrp->lgrp_latency = 0; 846 my_lgrp->lgrp_plathand = LGRP_NULL_HANDLE; 847 my_lgrp->lgrp_parent = NULL; 848 my_lgrp->lgrp_childcnt = 0; 849 my_lgrp->lgrp_mnodes = (mnodeset_t)0; 850 my_lgrp->lgrp_nmnodes = 0; 851 klgrpset_clear(my_lgrp->lgrp_children); 852 klgrpset_clear(my_lgrp->lgrp_leaves); 853 for (i = 0; i < LGRP_RSRC_COUNT; i++) 854 klgrpset_clear(my_lgrp->lgrp_set[i]); 855 856 my_lgrp->lgrp_cpu = NULL; 857 my_lgrp->lgrp_cpucnt = 0; 858 my_lgrp->lgrp_chips = NULL; 859 my_lgrp->lgrp_chipcnt = 0; 860 861 if (my_lgrp->lgrp_kstat != NULL) 862 lgrp_kstat_reset(lgrpid); 863 864 lgrp_table[my_lgrp->lgrp_id] = my_lgrp; 865 866 return (my_lgrp); 867 } 868 869 void 870 lgrp_destroy(lgrp_t *lgrp) 871 { 872 int i; 873 874 /* 875 * Unless this lgroup is being destroyed on behalf of 876 * the boot CPU, cpu_lock must be held 877 */ 878 ASSERT(!lgrp_initialized || MUTEX_HELD(&cpu_lock)); 879 880 if (nlgrps == 1) 881 cmn_err(CE_PANIC, "Can't destroy only lgroup!"); 882 883 if (!LGRP_EXISTS(lgrp)) 884 return; 885 886 /* 887 * Set hint to lgroup being deleted and try to keep lower numbered 888 * hints to facilitate finding empty slots 889 */ 
890 if (lgrp_alloc_hint == -1 || lgrp->lgrp_id < lgrp_alloc_hint) 891 lgrp_alloc_hint = lgrp->lgrp_id; 892 893 /* 894 * Mark this lgroup to be recycled by setting its lgroup ID to 895 * LGRP_NONE and clear relevant fields 896 */ 897 lgrp->lgrp_id = LGRP_NONE; 898 lgrp->lgrp_latency = 0; 899 lgrp->lgrp_plathand = LGRP_NULL_HANDLE; 900 lgrp->lgrp_parent = NULL; 901 lgrp->lgrp_childcnt = 0; 902 903 klgrpset_clear(lgrp->lgrp_children); 904 klgrpset_clear(lgrp->lgrp_leaves); 905 for (i = 0; i < LGRP_RSRC_COUNT; i++) 906 klgrpset_clear(lgrp->lgrp_set[i]); 907 908 lgrp->lgrp_mnodes = (mnodeset_t)0; 909 lgrp->lgrp_nmnodes = 0; 910 911 lgrp->lgrp_cpu = NULL; 912 lgrp->lgrp_cpucnt = 0; 913 lgrp->lgrp_chipcnt = 0; 914 lgrp->lgrp_chips = NULL; 915 916 nlgrps--; 917 } 918 919 /* 920 * Initialize kstat data. Called from lgrp intialization code. 921 */ 922 static void 923 lgrp_kstat_init(void) 924 { 925 lgrp_stat_t stat; 926 927 mutex_init(&lgrp_kstat_mutex, NULL, MUTEX_DEFAULT, NULL); 928 929 for (stat = 0; stat < LGRP_NUM_STATS; stat++) 930 kstat_named_init(&lgrp_kstat_data[stat], 931 lgrp_kstat_names[stat], KSTAT_DATA_INT64); 932 } 933 934 /* 935 * initialize an lgrp's kstats if needed 936 * called with cpu_lock held but not with cpus paused. 937 * we don't tear these down now because we don't know about 938 * memory leaving the lgrp yet... 939 */ 940 941 void 942 lgrp_kstat_create(cpu_t *cp) 943 { 944 kstat_t *lgrp_kstat; 945 lgrp_id_t lgrpid; 946 lgrp_t *my_lgrp; 947 948 ASSERT(MUTEX_HELD(&cpu_lock)); 949 950 lgrpid = cp->cpu_lpl->lpl_lgrpid; 951 my_lgrp = lgrp_table[lgrpid]; 952 953 if (my_lgrp->lgrp_kstat != NULL) 954 return; /* already initialized */ 955 956 lgrp_kstat = kstat_create("lgrp", lgrpid, NULL, "misc", 957 KSTAT_TYPE_NAMED, LGRP_NUM_STATS, 958 KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE); 959 960 if (lgrp_kstat != NULL) { 961 lgrp_kstat->ks_lock = &lgrp_kstat_mutex; 962 lgrp_kstat->ks_private = my_lgrp; 963 lgrp_kstat->ks_data = &lgrp_kstat_data; 964 lgrp_kstat->ks_update = lgrp_kstat_extract; 965 my_lgrp->lgrp_kstat = lgrp_kstat; 966 kstat_install(lgrp_kstat); 967 } 968 } 969 970 /* 971 * this will do something when we manage to remove now unused lgrps 972 */ 973 974 /* ARGSUSED */ 975 void 976 lgrp_kstat_destroy(cpu_t *cp) 977 { 978 ASSERT(MUTEX_HELD(&cpu_lock)); 979 } 980 981 /* 982 * Called when a CPU is off-lined. 983 */ 984 static void 985 lgrp_cpu_fini(struct cpu *cp, lgrp_id_t lgrpid) 986 { 987 lgrp_t *my_lgrp; 988 struct cpu *prev; 989 struct cpu *next; 990 chip_t *chp; 991 992 ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized); 993 994 prev = cp->cpu_prev_lgrp; 995 next = cp->cpu_next_lgrp; 996 997 prev->cpu_next_lgrp = next; 998 next->cpu_prev_lgrp = prev; 999 1000 /* 1001 * just because I'm paranoid doesn't mean... 1002 */ 1003 1004 cp->cpu_next_lgrp = cp->cpu_prev_lgrp = NULL; 1005 1006 my_lgrp = lgrp_table[lgrpid]; 1007 my_lgrp->lgrp_cpucnt--; 1008 1009 /* 1010 * If the last CPU on it's chip is being offlined 1011 * then remove this chip from the per lgroup list. 1012 * 1013 * This is also done for the boot CPU when it needs 1014 * to move between lgroups as a consequence of 1015 * null proc lpa. 
1016 */ 1017 chp = cp->cpu_chip; 1018 if (chp->chip_ncpu == 0 || !lgrp_initialized) { 1019 1020 chip_t *chpp; 1021 1022 if (--my_lgrp->lgrp_chipcnt == 0) 1023 my_lgrp->lgrp_chips = NULL; 1024 else if (my_lgrp->lgrp_chips == chp) 1025 my_lgrp->lgrp_chips = chp->chip_next_lgrp; 1026 1027 /* 1028 * Walk this lgroup's chip list looking for chips that 1029 * may try to balance against the one that's leaving 1030 */ 1031 for (chpp = chp->chip_next_lgrp; chpp != chp; 1032 chpp = chpp->chip_next_lgrp) { 1033 if (chpp->chip_balance == chp) 1034 chpp->chip_balance = chp->chip_next_lgrp; 1035 } 1036 1037 chp->chip_prev_lgrp->chip_next_lgrp = chp->chip_next_lgrp; 1038 chp->chip_next_lgrp->chip_prev_lgrp = chp->chip_prev_lgrp; 1039 1040 chp->chip_next_lgrp = chp->chip_prev_lgrp = NULL; 1041 chp->chip_lgrp = NULL; 1042 chp->chip_balance = NULL; 1043 } 1044 1045 /* 1046 * Removing last CPU in lgroup, so update lgroup topology 1047 */ 1048 if (my_lgrp->lgrp_cpucnt == 0) { 1049 klgrpset_t changed; 1050 int count; 1051 int i; 1052 1053 my_lgrp->lgrp_cpu = NULL; 1054 1055 /* 1056 * Remove this lgroup from its lgroup CPU resources and remove 1057 * lgroup from lgroup topology if it doesn't have any more 1058 * resources in it now 1059 */ 1060 klgrpset_del(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid); 1061 if (lgrp_rsets_empty(my_lgrp->lgrp_set)) { 1062 count = 0; 1063 klgrpset_clear(changed); 1064 count += lgrp_leaf_delete(my_lgrp, lgrp_table, 1065 lgrp_alloc_max + 1, &changed); 1066 return; 1067 } 1068 1069 /* 1070 * This lgroup isn't empty, so just remove it from CPU 1071 * resources of any lgroups that contain it as such 1072 */ 1073 for (i = 0; i <= lgrp_alloc_max; i++) { 1074 lgrp_t *lgrp; 1075 1076 lgrp = lgrp_table[i]; 1077 if (!LGRP_EXISTS(lgrp) || 1078 !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_CPU], 1079 lgrpid)) 1080 continue; 1081 1082 klgrpset_del(lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid); 1083 } 1084 return; 1085 } 1086 1087 if (my_lgrp->lgrp_cpu == cp) 1088 my_lgrp->lgrp_cpu = next; 1089 1090 } 1091 1092 /* 1093 * Update memory nodes in target lgroups and return ones that get changed 1094 */ 1095 int 1096 lgrp_mnode_update(klgrpset_t target, klgrpset_t *changed) 1097 { 1098 int count; 1099 int i; 1100 int j; 1101 lgrp_t *lgrp; 1102 lgrp_t *lgrp_rsrc; 1103 1104 count = 0; 1105 if (changed) 1106 klgrpset_clear(*changed); 1107 1108 if (klgrpset_isempty(target)) 1109 return (0); 1110 1111 /* 1112 * Find each lgroup in target lgroups 1113 */ 1114 for (i = 0; i <= lgrp_alloc_max; i++) { 1115 /* 1116 * Skip any lgroups that don't exist or aren't in target group 1117 */ 1118 lgrp = lgrp_table[i]; 1119 if (!klgrpset_ismember(target, i) || !LGRP_EXISTS(lgrp)) { 1120 continue; 1121 } 1122 1123 /* 1124 * Initialize memnodes for intermediate lgroups to 0 1125 * and update them from scratch since they may have completely 1126 * changed 1127 */ 1128 if (lgrp->lgrp_childcnt && lgrp != lgrp_root) { 1129 lgrp->lgrp_mnodes = (mnodeset_t)0; 1130 lgrp->lgrp_nmnodes = 0; 1131 } 1132 1133 /* 1134 * Update memory nodes of of target lgroup with memory nodes 1135 * from each lgroup in its lgroup memory resource set 1136 */ 1137 for (j = 0; j <= lgrp_alloc_max; j++) { 1138 int k; 1139 1140 /* 1141 * Skip any lgroups that don't exist or aren't in 1142 * memory resources of target lgroup 1143 */ 1144 lgrp_rsrc = lgrp_table[j]; 1145 if (!LGRP_EXISTS(lgrp_rsrc) || 1146 !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM], 1147 j)) 1148 continue; 1149 1150 /* 1151 * Update target lgroup's memnodes to include memnodes 1152 * of this 
lgroup 1153 */ 1154 for (k = 0; k < sizeof (mnodeset_t) * NBBY; k++) { 1155 mnodeset_t mnode_mask; 1156 1157 mnode_mask = (mnodeset_t)1 << k; 1158 if ((lgrp_rsrc->lgrp_mnodes & mnode_mask) && 1159 !(lgrp->lgrp_mnodes & mnode_mask)) { 1160 lgrp->lgrp_mnodes |= mnode_mask; 1161 lgrp->lgrp_nmnodes++; 1162 } 1163 } 1164 count++; 1165 if (changed) 1166 klgrpset_add(*changed, lgrp->lgrp_id); 1167 } 1168 } 1169 1170 return (count); 1171 } 1172 1173 /* 1174 * Memory copy-rename. Called when the "mnode" containing the kernel cage memory 1175 * is moved from one board to another. The "from" and "to" arguments specify the 1176 * source and the destination of the move. 1177 * 1178 * See plat_lgrp_config() for a detailed description of the copy-rename 1179 * semantics. 1180 * 1181 * The lgrp_mem_rename() is called by the platform copy-rename code to update 1182 * the lgroup topology which is changing as memory moves from one lgroup to 1183 * another. It removes the mnode from the source lgroup and re-inserts it in the 1184 * target lgroup. 1185 * 1186 * The lgrp_mem_rename() function passes a flag to lgrp_mem_init() and 1187 * lgrp_mem_fini() telling that the insertion and deleteion are part of a DR 1188 * copy-rename operation. 1189 * 1190 * There is one case which requires special handling. If the system contains 1191 * only two boards (mnodes), the lgrp_mem_fini() removes the only mnode from the 1192 * lgroup hierarchy. This mnode is soon re-inserted back in the hierarchy by 1193 * lgrp_mem_init), but there is a window when the system has no memory in the 1194 * lgroup hierarchy. If another thread tries to allocate memory during this 1195 * window, the allocation will fail, although the system has physical memory. 1196 * This may cause a system panic or a deadlock (some sleeping memory allocations 1197 * happen with cpu_lock held which prevents lgrp_mem_init() from re-inserting 1198 * the mnode back). 1199 * 1200 * The lgrp_memnode_choose() function walks the lgroup hierarchy looking for the 1201 * lgrp with non-empty lgrp_mnodes. To deal with the special case above, 1202 * lgrp_mem_fini() does not remove the last mnode from the lroot->lgrp_mnodes, 1203 * but it updates the rest of the lgroup topology as if the mnode was actually 1204 * removed. The lgrp_mem_init() function recognizes that the mnode being 1205 * inserted represents such a special case and updates the topology 1206 * appropriately. 1207 */ 1208 void 1209 lgrp_mem_rename(int mnode, lgrp_handle_t from, lgrp_handle_t to) 1210 { 1211 /* 1212 * Remove the memory from the source node and add it to the destination 1213 * node. 1214 */ 1215 lgrp_mem_fini(mnode, from, B_TRUE); 1216 lgrp_mem_init(mnode, to, B_TRUE); 1217 } 1218 1219 /* 1220 * Called to indicate that the lgrp with platform handle "hand" now 1221 * contains the memory identified by "mnode". 1222 * 1223 * LOCKING for this routine is a bit tricky. Usually it is called without 1224 * cpu_lock and it must must grab cpu_lock here to prevent racing with other 1225 * callers. During DR of the board containing the caged memory it may be called 1226 * with cpu_lock already held and CPUs paused. 1227 * 1228 * If the insertion is part of the DR copy-rename and the inserted mnode (and 1229 * only this mnode) is already present in the lgrp_root->lgrp_mnodes set, we are 1230 * dealing with the special case of DR copy-rename described in 1231 * lgrp_mem_rename(). 
1232 */ 1233 void 1234 lgrp_mem_init(int mnode, lgrp_handle_t hand, boolean_t is_copy_rename) 1235 { 1236 klgrpset_t changed; 1237 int count; 1238 int i; 1239 lgrp_t *my_lgrp; 1240 lgrp_id_t lgrpid; 1241 mnodeset_t mnodes_mask = ((mnodeset_t)1 << mnode); 1242 boolean_t drop_lock = B_FALSE; 1243 boolean_t need_synch = B_FALSE; 1244 1245 /* 1246 * Grab CPU lock (if we haven't already) 1247 */ 1248 if (!MUTEX_HELD(&cpu_lock)) { 1249 mutex_enter(&cpu_lock); 1250 drop_lock = B_TRUE; 1251 } 1252 1253 /* 1254 * This routine may be called from a context where we already 1255 * hold cpu_lock, and have already paused cpus. 1256 */ 1257 if (!cpus_paused()) 1258 need_synch = B_TRUE; 1259 1260 /* 1261 * Check if this mnode is already configured and return immediately if 1262 * it is. 1263 * 1264 * NOTE: in special case of copy-rename of the only remaining mnode, 1265 * lgrp_mem_fini() refuses to remove the last mnode from the root, so we 1266 * recognize this case and continue as usual, but skip the update to 1267 * the lgrp_mnodes and the lgrp_nmnodes. This restores the inconsistency 1268 * in topology, temporarily introduced by lgrp_mem_fini(). 1269 */ 1270 if (! (is_copy_rename && (lgrp_root->lgrp_mnodes == mnodes_mask)) && 1271 lgrp_root->lgrp_mnodes & mnodes_mask) { 1272 if (drop_lock) 1273 mutex_exit(&cpu_lock); 1274 return; 1275 } 1276 1277 /* 1278 * Update lgroup topology with new memory resources, keeping track of 1279 * which lgroups change 1280 */ 1281 count = 0; 1282 klgrpset_clear(changed); 1283 my_lgrp = lgrp_hand_to_lgrp(hand); 1284 if (my_lgrp == NULL) { 1285 /* new lgrp */ 1286 my_lgrp = lgrp_create(); 1287 lgrpid = my_lgrp->lgrp_id; 1288 my_lgrp->lgrp_plathand = hand; 1289 my_lgrp->lgrp_latency = lgrp_plat_latency(hand, hand); 1290 klgrpset_add(my_lgrp->lgrp_leaves, lgrpid); 1291 klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid); 1292 1293 if (need_synch) 1294 pause_cpus(NULL); 1295 count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1, 1296 &changed); 1297 if (need_synch) 1298 start_cpus(); 1299 } else if (my_lgrp->lgrp_latency == 0 && lgrp_plat_latency(hand, hand) 1300 > 0) { 1301 /* 1302 * Leaf lgroup was created, but latency wasn't available 1303 * then. So, set latency for it and fill in rest of lgroup 1304 * topology now that we know how far it is from other leaf 1305 * lgroups. 
1306 */ 1307 klgrpset_clear(changed); 1308 lgrpid = my_lgrp->lgrp_id; 1309 if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_MEM], 1310 lgrpid)) 1311 klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid); 1312 if (need_synch) 1313 pause_cpus(NULL); 1314 count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1, 1315 &changed); 1316 if (need_synch) 1317 start_cpus(); 1318 } else if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_MEM], 1319 my_lgrp->lgrp_id)) { 1320 /* 1321 * Add new lgroup memory resource to existing lgroup 1322 */ 1323 lgrpid = my_lgrp->lgrp_id; 1324 klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid); 1325 klgrpset_add(changed, lgrpid); 1326 count++; 1327 for (i = 0; i <= lgrp_alloc_max; i++) { 1328 lgrp_t *lgrp; 1329 1330 lgrp = lgrp_table[i]; 1331 if (!LGRP_EXISTS(lgrp) || 1332 !lgrp_rsets_member(lgrp->lgrp_set, lgrpid)) 1333 continue; 1334 1335 klgrpset_add(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid); 1336 klgrpset_add(changed, lgrp->lgrp_id); 1337 count++; 1338 } 1339 } 1340 1341 /* 1342 * Add memory node to lgroup and remove lgroup from ones that need 1343 * to be updated 1344 */ 1345 if (!(my_lgrp->lgrp_mnodes & mnodes_mask)) { 1346 my_lgrp->lgrp_mnodes |= mnodes_mask; 1347 my_lgrp->lgrp_nmnodes++; 1348 } 1349 klgrpset_del(changed, lgrpid); 1350 1351 /* 1352 * Update memory node information for all lgroups that changed and 1353 * contain new memory node as a resource 1354 */ 1355 if (count) 1356 (void) lgrp_mnode_update(changed, NULL); 1357 1358 if (drop_lock) 1359 mutex_exit(&cpu_lock); 1360 } 1361 1362 /* 1363 * Called to indicate that the lgroup associated with the platform 1364 * handle "hand" no longer contains given memory node 1365 * 1366 * LOCKING for this routine is a bit tricky. Usually it is called without 1367 * cpu_lock and it must must grab cpu_lock here to prevent racing with other 1368 * callers. During DR of the board containing the caged memory it may be called 1369 * with cpu_lock already held and CPUs paused. 1370 * 1371 * If the deletion is part of the DR copy-rename and the deleted mnode is the 1372 * only one present in the lgrp_root->lgrp_mnodes, all the topology is updated, 1373 * but lgrp_root->lgrp_mnodes is left intact. Later, lgrp_mem_init() will insert 1374 * the same mnode back into the topology. See lgrp_mem_rename() and 1375 * lgrp_mem_init() for additional details. 1376 */ 1377 void 1378 lgrp_mem_fini(int mnode, lgrp_handle_t hand, boolean_t is_copy_rename) 1379 { 1380 klgrpset_t changed; 1381 int count; 1382 int i; 1383 lgrp_t *my_lgrp; 1384 lgrp_id_t lgrpid; 1385 mnodeset_t mnodes_mask; 1386 boolean_t drop_lock = B_FALSE; 1387 boolean_t need_synch = B_FALSE; 1388 1389 /* 1390 * Grab CPU lock (if we haven't already) 1391 */ 1392 if (!MUTEX_HELD(&cpu_lock)) { 1393 mutex_enter(&cpu_lock); 1394 drop_lock = B_TRUE; 1395 } 1396 1397 /* 1398 * This routine may be called from a context where we already 1399 * hold cpu_lock and have already paused cpus. 
1400 */ 1401 if (!cpus_paused()) 1402 need_synch = B_TRUE; 1403 1404 my_lgrp = lgrp_hand_to_lgrp(hand); 1405 1406 /* 1407 * The lgrp *must* be pre-existing 1408 */ 1409 ASSERT(my_lgrp != NULL); 1410 1411 /* 1412 * Delete memory node from lgroups which contain it 1413 */ 1414 mnodes_mask = ((mnodeset_t)1 << mnode); 1415 for (i = 0; i <= lgrp_alloc_max; i++) { 1416 lgrp_t *lgrp = lgrp_table[i]; 1417 /* 1418 * Skip any non-existent lgroups and any lgroups that don't 1419 * contain leaf lgroup of memory as a memory resource 1420 */ 1421 if (!LGRP_EXISTS(lgrp) || 1422 !(lgrp->lgrp_mnodes & mnodes_mask)) 1423 continue; 1424 1425 /* 1426 * Avoid removing the last mnode from the root in the DR 1427 * copy-rename case. See lgrp_mem_rename() for details. 1428 */ 1429 if (is_copy_rename && 1430 (lgrp == lgrp_root) && (lgrp->lgrp_mnodes == mnodes_mask)) 1431 continue; 1432 1433 /* 1434 * Remove memory node from lgroup. 1435 */ 1436 lgrp->lgrp_mnodes &= ~mnodes_mask; 1437 lgrp->lgrp_nmnodes--; 1438 ASSERT(lgrp->lgrp_nmnodes >= 0); 1439 } 1440 ASSERT(lgrp_root->lgrp_nmnodes > 0); 1441 1442 /* 1443 * Don't need to update lgroup topology if this lgroup still has memory. 1444 * 1445 * In the special case of DR copy-rename with the only mnode being 1446 * removed, the lgrp_mnodes for the root is always non-zero, but we 1447 * still need to update the lgroup topology. 1448 */ 1449 if ((my_lgrp->lgrp_nmnodes > 0) && 1450 !(is_copy_rename && 1451 (my_lgrp == lgrp_root) && 1452 (my_lgrp->lgrp_mnodes == mnodes_mask))) { 1453 if (drop_lock) 1454 mutex_exit(&cpu_lock); 1455 return; 1456 } 1457 1458 /* 1459 * This lgroup does not contain any memory now 1460 */ 1461 klgrpset_clear(my_lgrp->lgrp_set[LGRP_RSRC_MEM]); 1462 1463 /* 1464 * Remove this lgroup from lgroup topology if it does not contain any 1465 * resources now 1466 */ 1467 lgrpid = my_lgrp->lgrp_id; 1468 count = 0; 1469 klgrpset_clear(changed); 1470 if (lgrp_rsets_empty(my_lgrp->lgrp_set)) { 1471 /* 1472 * Delete lgroup when no more resources 1473 */ 1474 if (need_synch) 1475 pause_cpus(NULL); 1476 count = lgrp_leaf_delete(my_lgrp, lgrp_table, 1477 lgrp_alloc_max + 1, &changed); 1478 ASSERT(count > 0); 1479 if (need_synch) 1480 start_cpus(); 1481 } else { 1482 /* 1483 * Remove lgroup from memory resources of any lgroups that 1484 * contain it as such 1485 */ 1486 for (i = 0; i <= lgrp_alloc_max; i++) { 1487 lgrp_t *lgrp; 1488 1489 lgrp = lgrp_table[i]; 1490 if (!LGRP_EXISTS(lgrp) || 1491 !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM], 1492 lgrpid)) 1493 continue; 1494 1495 klgrpset_del(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid); 1496 } 1497 } 1498 if (drop_lock) 1499 mutex_exit(&cpu_lock); 1500 } 1501 1502 /* 1503 * Return lgroup with given platform handle 1504 */ 1505 lgrp_t * 1506 lgrp_hand_to_lgrp(lgrp_handle_t hand) 1507 { 1508 int i; 1509 lgrp_t *lgrp; 1510 1511 if (hand == LGRP_NULL_HANDLE) 1512 return (NULL); 1513 1514 for (i = 0; i <= lgrp_alloc_max; i++) { 1515 lgrp = lgrp_table[i]; 1516 if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand) 1517 return (lgrp); 1518 } 1519 return (NULL); 1520 } 1521 1522 /* 1523 * Return the home lgroup of the current thread. 1524 * We must do this with kernel preemption disabled, since we don't want our 1525 * thread to be re-homed while we're poking around with its lpl, and the lpl 1526 * should never be NULL. 1527 * 1528 * NOTE: Can't guarantee that lgroup will be valid once kernel preemption 1529 * is enabled because of DR. 
Callers can use disable kernel preemption 1530 * around this call to guarantee that the lgroup will be valid beyond this 1531 * routine, since kernel preemption can be recursive. 1532 */ 1533 lgrp_t * 1534 lgrp_home_lgrp(void) 1535 { 1536 lgrp_t *lgrp; 1537 lpl_t *lpl; 1538 1539 kpreempt_disable(); 1540 1541 lpl = curthread->t_lpl; 1542 ASSERT(lpl != NULL); 1543 ASSERT(lpl->lpl_lgrpid >= 0 && lpl->lpl_lgrpid <= lgrp_alloc_max); 1544 ASSERT(LGRP_EXISTS(lgrp_table[lpl->lpl_lgrpid])); 1545 lgrp = lgrp_table[lpl->lpl_lgrpid]; 1546 1547 kpreempt_enable(); 1548 1549 return (lgrp); 1550 } 1551 1552 /* 1553 * Return ID of home lgroup for given thread 1554 * (See comments for lgrp_home_lgrp() for special care and handling 1555 * instructions) 1556 */ 1557 lgrp_id_t 1558 lgrp_home_id(kthread_t *t) 1559 { 1560 lgrp_id_t lgrp; 1561 lpl_t *lpl; 1562 1563 ASSERT(t != NULL); 1564 /* 1565 * We'd like to ASSERT(MUTEX_HELD(&ttoproc(t)->p_lock)), but we 1566 * cannot since the HAT layer can call into this routine to 1567 * determine the locality for its data structures in the context 1568 * of a page fault. 1569 */ 1570 1571 kpreempt_disable(); 1572 1573 lpl = t->t_lpl; 1574 ASSERT(lpl != NULL); 1575 ASSERT(lpl->lpl_lgrpid >= 0 && lpl->lpl_lgrpid <= lgrp_alloc_max); 1576 lgrp = lpl->lpl_lgrpid; 1577 1578 kpreempt_enable(); 1579 1580 return (lgrp); 1581 } 1582 1583 /* 1584 * Return lgroup containing the physical memory for the given page frame number 1585 */ 1586 lgrp_t * 1587 lgrp_pfn_to_lgrp(pfn_t pfn) 1588 { 1589 lgrp_handle_t hand; 1590 int i; 1591 lgrp_t *lgrp; 1592 1593 hand = lgrp_plat_pfn_to_hand(pfn); 1594 if (hand != LGRP_NULL_HANDLE) 1595 for (i = 0; i <= lgrp_alloc_max; i++) { 1596 lgrp = lgrp_table[i]; 1597 if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand) 1598 return (lgrp); 1599 } 1600 return (NULL); 1601 } 1602 1603 /* 1604 * Return lgroup containing the physical memory for the given page frame number 1605 */ 1606 lgrp_t * 1607 lgrp_phys_to_lgrp(u_longlong_t physaddr) 1608 { 1609 lgrp_handle_t hand; 1610 int i; 1611 lgrp_t *lgrp; 1612 pfn_t pfn; 1613 1614 pfn = btop(physaddr); 1615 hand = lgrp_plat_pfn_to_hand(pfn); 1616 if (hand != LGRP_NULL_HANDLE) 1617 for (i = 0; i <= lgrp_alloc_max; i++) { 1618 lgrp = lgrp_table[i]; 1619 if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand) 1620 return (lgrp); 1621 } 1622 return (NULL); 1623 } 1624 1625 /* 1626 * Return the leaf lgroup containing the given CPU 1627 * 1628 * The caller needs to take precautions necessary to prevent 1629 * "cpu" from going away across a call to this function. 1630 * hint: kpreempt_disable()/kpreempt_enable() 1631 */ 1632 static lgrp_t * 1633 lgrp_cpu_to_lgrp(cpu_t *cpu) 1634 { 1635 return (cpu->cpu_chip->chip_lgrp); 1636 } 1637 1638 /* 1639 * Return the sum of the partition loads in an lgrp divided by 1640 * the number of CPUs in the lgrp. This is our best approximation 1641 * of an 'lgroup load average' for a useful per-lgroup kstat. 
1642 */ 1643 static uint64_t 1644 lgrp_sum_loadavgs(lgrp_t *lgrp) 1645 { 1646 cpu_t *cpu; 1647 int ncpu; 1648 uint64_t loads = 0; 1649 1650 mutex_enter(&cpu_lock); 1651 1652 cpu = lgrp->lgrp_cpu; 1653 ncpu = lgrp->lgrp_cpucnt; 1654 1655 if (cpu == NULL || ncpu == 0) { 1656 mutex_exit(&cpu_lock); 1657 return (0ull); 1658 } 1659 1660 do { 1661 loads += cpu->cpu_lpl->lpl_loadavg; 1662 cpu = cpu->cpu_next_lgrp; 1663 } while (cpu != lgrp->lgrp_cpu); 1664 1665 mutex_exit(&cpu_lock); 1666 1667 return (loads / ncpu); 1668 } 1669 1670 void 1671 lgrp_stat_add(lgrp_id_t lgrpid, lgrp_stat_t stat, int64_t val) 1672 { 1673 struct lgrp_stats *pstats; 1674 1675 /* 1676 * Verify that the caller isn't trying to add to 1677 * a statistic for an lgroup that has gone away 1678 */ 1679 if (lgrpid < 0 || lgrpid > lgrp_alloc_max) 1680 return; 1681 1682 pstats = &lgrp_stats[lgrpid]; 1683 atomic_add_64((uint64_t *)LGRP_STAT_WRITE_PTR(pstats, stat), val); 1684 } 1685 1686 int64_t 1687 lgrp_stat_read(lgrp_id_t lgrpid, lgrp_stat_t stat) 1688 { 1689 uint64_t val; 1690 struct lgrp_stats *pstats; 1691 1692 if (lgrpid < 0 || lgrpid > lgrp_alloc_max) 1693 return ((int64_t)0); 1694 1695 pstats = &lgrp_stats[lgrpid]; 1696 LGRP_STAT_READ(pstats, stat, val); 1697 return (val); 1698 } 1699 1700 /* 1701 * Reset all kstats for lgrp specified by its lgrpid. 1702 */ 1703 static void 1704 lgrp_kstat_reset(lgrp_id_t lgrpid) 1705 { 1706 lgrp_stat_t stat; 1707 1708 if (lgrpid < 0 || lgrpid > lgrp_alloc_max) 1709 return; 1710 1711 for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) { 1712 LGRP_STAT_RESET(&lgrp_stats[lgrpid], stat); 1713 } 1714 } 1715 1716 /* 1717 * Collect all per-lgrp statistics for the lgrp associated with this 1718 * kstat, and store them in the ks_data array. 1719 * 1720 * The superuser can reset all the running counter statistics for an 1721 * lgrp by writing to any of the lgrp's stats. 1722 */ 1723 static int 1724 lgrp_kstat_extract(kstat_t *ksp, int rw) 1725 { 1726 lgrp_stat_t stat; 1727 struct kstat_named *ksd; 1728 lgrp_t *lgrp; 1729 lgrp_id_t lgrpid; 1730 1731 lgrp = (lgrp_t *)ksp->ks_private; 1732 1733 ksd = (struct kstat_named *)ksp->ks_data; 1734 ASSERT(ksd == (struct kstat_named *)&lgrp_kstat_data); 1735 1736 lgrpid = lgrp->lgrp_id; 1737 1738 if (lgrpid == LGRP_NONE) { 1739 /* 1740 * Return all zeroes as stats for freed lgrp. 
1741 */ 1742 for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) { 1743 ksd[stat].value.i64 = 0; 1744 } 1745 ksd[stat + LGRP_NUM_CPUS].value.i64 = 0; 1746 ksd[stat + LGRP_NUM_PG_INSTALL].value.i64 = 0; 1747 ksd[stat + LGRP_NUM_PG_AVAIL].value.i64 = 0; 1748 ksd[stat + LGRP_NUM_PG_FREE].value.i64 = 0; 1749 ksd[stat + LGRP_LOADAVG].value.i64 = 0; 1750 } else if (rw != KSTAT_WRITE) { 1751 /* 1752 * Handle counter stats 1753 */ 1754 for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) { 1755 ksd[stat].value.i64 = lgrp_stat_read(lgrpid, stat); 1756 } 1757 1758 /* 1759 * Handle kernel data snapshot stats 1760 */ 1761 ksd[stat + LGRP_NUM_CPUS].value.i64 = lgrp->lgrp_cpucnt; 1762 ksd[stat + LGRP_NUM_PG_INSTALL].value.i64 = 1763 lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_INSTALL); 1764 ksd[stat + LGRP_NUM_PG_AVAIL].value.i64 = 1765 lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_AVAIL); 1766 ksd[stat + LGRP_NUM_PG_FREE].value.i64 = 1767 lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_FREE); 1768 ksd[stat + LGRP_LOADAVG].value.i64 = lgrp_sum_loadavgs(lgrp); 1769 } else { 1770 lgrp_kstat_reset(lgrpid); 1771 } 1772 1773 return (0); 1774 } 1775 1776 int 1777 lgrp_query_cpu(processorid_t id, lgrp_id_t *lp) 1778 { 1779 cpu_t *cp; 1780 1781 mutex_enter(&cpu_lock); 1782 1783 if ((cp = cpu_get(id)) == NULL) { 1784 mutex_exit(&cpu_lock); 1785 return (EINVAL); 1786 } 1787 1788 if (cpu_is_offline(cp) || cpu_is_poweredoff(cp)) { 1789 mutex_exit(&cpu_lock); 1790 return (EINVAL); 1791 } 1792 1793 ASSERT(cp->cpu_lpl != NULL); 1794 1795 *lp = cp->cpu_lpl->lpl_lgrpid; 1796 1797 mutex_exit(&cpu_lock); 1798 1799 return (0); 1800 } 1801 1802 int 1803 lgrp_query_load(processorid_t id, lgrp_load_t *lp) 1804 { 1805 cpu_t *cp; 1806 1807 mutex_enter(&cpu_lock); 1808 1809 if ((cp = cpu_get(id)) == NULL) { 1810 mutex_exit(&cpu_lock); 1811 return (EINVAL); 1812 } 1813 1814 ASSERT(cp->cpu_lpl != NULL); 1815 1816 *lp = cp->cpu_lpl->lpl_loadavg; 1817 1818 mutex_exit(&cpu_lock); 1819 1820 return (0); 1821 } 1822 1823 void 1824 lgrp_latency_change(u_longlong_t oldtime, u_longlong_t newtime) 1825 { 1826 lgrp_t *lgrp; 1827 int i; 1828 1829 for (i = 0; i <= lgrp_alloc_max; i++) { 1830 lgrp = lgrp_table[i]; 1831 1832 if (LGRP_EXISTS(lgrp) && (lgrp->lgrp_latency == oldtime)) 1833 lgrp->lgrp_latency = (int)newtime; 1834 } 1835 } 1836 1837 /* 1838 * Add a resource named by lpl_leaf to rset of lpl_target 1839 * 1840 * This routine also adjusts ncpu and nrset if the call succeeds in adding a 1841 * resource. It is adjusted here, as this is presently the only place that we 1842 * can be certain a resource addition has succeeded. 1843 * 1844 * We keep the list of rsets sorted so that the dispatcher can quickly walk the 1845 * list in order until it reaches a NULL. (This list is required to be NULL 1846 * terminated, too). This is done so that we can mark start pos + 1, so that 1847 * each lpl is traversed sequentially, but in a different order. We hope this 1848 * will improve performance a bit. (Hopefully, less read-to-own traffic...) 
1849 */ 1850 1851 void 1852 lpl_rset_add(lpl_t *lpl_target, lpl_t *lpl_leaf) 1853 { 1854 int i; 1855 int entry_slot = 0; 1856 1857 /* return if leaf is already present */ 1858 for (i = 0; i < lpl_target->lpl_nrset; i++) { 1859 if (lpl_target->lpl_rset[i] == lpl_leaf) { 1860 return; 1861 } 1862 1863 if (lpl_target->lpl_rset[i]->lpl_lgrpid > 1864 lpl_leaf->lpl_lgrpid) { 1865 break; 1866 } 1867 } 1868 1869 /* insert leaf, update counts */ 1870 entry_slot = i; 1871 i = lpl_target->lpl_nrset++; 1872 if (lpl_target->lpl_nrset >= LPL_RSET_MAX) { 1873 panic("More leaf lgrps in system than are supported!\n"); 1874 } 1875 1876 /* 1877 * Start at the end of the rset array and work backwards towards the 1878 * slot into which the new lpl will be inserted. This effectively 1879 * preserves the current ordering by scooting everybody over one entry, 1880 * and placing the new entry into the space created. 1881 */ 1882 1883 while (i-- > entry_slot) { 1884 lpl_target->lpl_rset[i + 1] = lpl_target->lpl_rset[i]; 1885 } 1886 1887 lpl_target->lpl_rset[entry_slot] = lpl_leaf; 1888 lpl_target->lpl_ncpu += lpl_leaf->lpl_ncpu; 1889 } 1890 1891 /* 1892 * Update each of lpl_parent's children with a proper hint and 1893 * a reference to their parent. 1894 * The lgrp topology is used as the reference since it is fully 1895 * consistent and correct at this point. 1896 * 1897 * Each child's hint will reference an element in lpl_parent's 1898 * rset that designates where the child should start searching 1899 * for CPU resources. The hint selected is the highest order leaf present 1900 * in the child's lineage. 1901 * 1902 * This should be called after any potential change in lpl_parent's 1903 * rset. 1904 */ 1905 static void 1906 lpl_child_update(lpl_t *lpl_parent, struct cpupart *cp) 1907 { 1908 klgrpset_t children, leaves; 1909 lpl_t *lpl; 1910 int hint; 1911 int i, j; 1912 1913 children = lgrp_table[lpl_parent->lpl_lgrpid]->lgrp_children; 1914 if (klgrpset_isempty(children)) 1915 return; /* nothing to do */ 1916 1917 for (i = 0; i <= lgrp_alloc_max; i++) { 1918 if (klgrpset_ismember(children, i)) { 1919 1920 /* 1921 * Given the set of leaves in this child's lineage, 1922 * find the highest order leaf present in the parent's 1923 * rset. Select this as the hint for the child. 1924 */ 1925 leaves = lgrp_table[i]->lgrp_leaves; 1926 hint = 0; 1927 for (j = 0; j < lpl_parent->lpl_nrset; j++) { 1928 lpl = lpl_parent->lpl_rset[j]; 1929 if (klgrpset_ismember(leaves, lpl->lpl_lgrpid)) 1930 hint = j; 1931 } 1932 cp->cp_lgrploads[i].lpl_hint = hint; 1933 1934 /* 1935 * (Re)set the parent. It may be incorrect if 1936 * lpl_parent is new in the topology. 1937 */ 1938 cp->cp_lgrploads[i].lpl_parent = lpl_parent; 1939 } 1940 } 1941 } 1942 1943 /* 1944 * Delete resource lpl_leaf from rset of lpl_target, assuming it's there. 1945 * 1946 * This routine also adjusts ncpu and nrset if the call succeeds in deleting a 1947 * resource. The values are adjusted here, as this is the only place that we can 1948 * be certain a resource was successfully deleted. 
1949 */ 1950 void 1951 lpl_rset_del(lpl_t *lpl_target, lpl_t *lpl_leaf) 1952 { 1953 int i; 1954 1955 /* find leaf in intermediate node */ 1956 for (i = 0; i < lpl_target->lpl_nrset; i++) { 1957 if (lpl_target->lpl_rset[i] == lpl_leaf) 1958 break; 1959 } 1960 1961 /* return if leaf not found */ 1962 if (lpl_target->lpl_rset[i] != lpl_leaf) 1963 return; 1964 1965 /* prune leaf, compress array */ 1966 ASSERT(lpl_target->lpl_nrset < LPL_RSET_MAX); 1967 lpl_target->lpl_rset[lpl_target->lpl_nrset--] = NULL; 1968 lpl_target->lpl_ncpu--; 1969 do { 1970 lpl_target->lpl_rset[i] = lpl_target->lpl_rset[i + 1]; 1971 } while (i++ < lpl_target->lpl_nrset); 1972 } 1973 1974 /* 1975 * Check to see if the resource set of the target lpl contains the 1976 * supplied leaf lpl. This returns 1 if the lpl is found, 0 if it is not. 1977 */ 1978 1979 int 1980 lpl_rset_contains(lpl_t *lpl_target, lpl_t *lpl_leaf) 1981 { 1982 int i; 1983 1984 for (i = 0; i < lpl_target->lpl_nrset; i++) { 1985 if (lpl_target->lpl_rset[i] == lpl_leaf) 1986 return (1); 1987 } 1988 1989 return (0); 1990 } 1991 1992 /* 1993 * Called when we change cpu lpl membership. This increments or decrements the 1994 * per-cpu counter in every lpl in which our leaf appears. 1995 */ 1996 void 1997 lpl_cpu_adjcnt(lpl_act_t act, cpu_t *cp) 1998 { 1999 cpupart_t *cpupart; 2000 lgrp_t *lgrp_leaf; 2001 lgrp_t *lgrp_cur; 2002 lpl_t *lpl_leaf; 2003 lpl_t *lpl_cur; 2004 int i; 2005 2006 ASSERT(act == LPL_DECREMENT || act == LPL_INCREMENT); 2007 2008 cpupart = cp->cpu_part; 2009 lpl_leaf = cp->cpu_lpl; 2010 lgrp_leaf = lgrp_table[lpl_leaf->lpl_lgrpid]; 2011 2012 for (i = 0; i <= lgrp_alloc_max; i++) { 2013 lgrp_cur = lgrp_table[i]; 2014 2015 /* 2016 * Don't adjust if the lgrp isn't there, if we're the leaf lpl 2017 * for the cpu in question, or if the current lgrp and leaf 2018 * don't share the same resources. 2019 */ 2020 2021 if (!LGRP_EXISTS(lgrp_cur) || (lgrp_cur == lgrp_leaf) || 2022 !klgrpset_intersects(lgrp_leaf->lgrp_set[LGRP_RSRC_CPU], 2023 lgrp_cur->lgrp_set[LGRP_RSRC_CPU])) 2024 continue; 2025 2026 2027 lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id]; 2028 2029 if (lpl_cur->lpl_nrset > 0) { 2030 if (act == LPL_INCREMENT) { 2031 lpl_cur->lpl_ncpu++; 2032 } else if (act == LPL_DECREMENT) { 2033 lpl_cur->lpl_ncpu--; 2034 } 2035 } 2036 } 2037 } 2038 2039 /* 2040 * Initialize lpl with given resources and specified lgrp 2041 */ 2042 2043 void 2044 lpl_init(lpl_t *lpl, lpl_t *lpl_leaf, lgrp_t *lgrp) 2045 { 2046 lpl->lpl_lgrpid = lgrp->lgrp_id; 2047 lpl->lpl_loadavg = 0; 2048 if (lpl == lpl_leaf) 2049 lpl->lpl_ncpu = 1; 2050 else 2051 lpl->lpl_ncpu = lpl_leaf->lpl_ncpu; 2052 lpl->lpl_nrset = 1; 2053 lpl->lpl_rset[0] = lpl_leaf; 2054 lpl->lpl_lgrp = lgrp; 2055 lpl->lpl_parent = NULL; /* set by lpl_leaf_insert() */ 2056 lpl->lpl_cpus = NULL; /* set by lgrp_part_add_cpu() */ 2057 } 2058 2059 /* 2060 * Clear an unused lpl 2061 */ 2062 2063 void 2064 lpl_clear(lpl_t *lpl) 2065 { 2066 lgrpid_t lid; 2067 2068 /* save lid for debugging purposes */ 2069 lid = lpl->lpl_lgrpid; 2070 bzero(lpl, sizeof (lpl_t)); 2071 lpl->lpl_lgrpid = lid; 2072 } 2073 2074 /* 2075 * Given a CPU-partition, verify that the lpl topology in the CPU-partition 2076 * is in sync with the lgroup toplogy in the system. The lpl topology may not 2077 * make full use of all of the lgroup topology, but this checks to make sure 2078 * that for the parts that it does use, it has correctly understood the 2079 * relationships that exist. 
This function returns 2080 * 0 if the topology is correct, and a non-zero error code, for non-debug 2081 * kernels if incorrect. Asserts are spread throughout the code to aid in 2082 * debugging on a DEBUG kernel. 2083 */ 2084 int 2085 lpl_topo_verify(cpupart_t *cpupart) 2086 { 2087 lgrp_t *lgrp; 2088 lpl_t *lpl; 2089 klgrpset_t rset; 2090 klgrpset_t cset; 2091 cpu_t *cpu; 2092 cpu_t *cp_start; 2093 int i; 2094 int j; 2095 int sum; 2096 2097 /* topology can't be incorrect if it doesn't exist */ 2098 if (!lgrp_topo_initialized || !lgrp_initialized) 2099 return (LPL_TOPO_CORRECT); 2100 2101 ASSERT(cpupart != NULL); 2102 2103 for (i = 0; i <= lgrp_alloc_max; i++) { 2104 lgrp = lgrp_table[i]; 2105 lpl = NULL; 2106 /* make sure lpls are allocated */ 2107 ASSERT(cpupart->cp_lgrploads); 2108 if (!cpupart->cp_lgrploads) 2109 return (LPL_TOPO_PART_HAS_NO_LPL); 2110 2111 lpl = &cpupart->cp_lgrploads[i]; 2112 /* make sure our index is good */ 2113 ASSERT(i < cpupart->cp_nlgrploads); 2114 2115 /* if lgroup doesn't exist, make sure lpl is empty */ 2116 if (!LGRP_EXISTS(lgrp)) { 2117 ASSERT(lpl->lpl_ncpu == 0); 2118 if (lpl->lpl_ncpu > 0) { 2119 return (LPL_TOPO_CPUS_NOT_EMPTY); 2120 } else { 2121 continue; 2122 } 2123 } 2124 2125 /* verify that lgroup and lpl are identically numbered */ 2126 ASSERT(lgrp->lgrp_id == lpl->lpl_lgrpid); 2127 2128 /* if lgroup isn't in our partition, make sure lpl is empty */ 2129 if (!klgrpset_intersects(lgrp->lgrp_leaves, 2130 cpupart->cp_lgrpset)) { 2131 ASSERT(lpl->lpl_ncpu == 0); 2132 if (lpl->lpl_ncpu > 0) { 2133 return (LPL_TOPO_CPUS_NOT_EMPTY); 2134 } 2135 /* 2136 * lpl is empty, and lgroup isn't in partition. verify 2137 * that lpl doesn't show up in anyone else's rsets (in 2138 * this partition, anyway) 2139 */ 2140 2141 for (j = 0; j < cpupart->cp_nlgrploads; j++) { 2142 lpl_t *i_lpl; /* lpl we're iterating over */ 2143 2144 i_lpl = &cpupart->cp_lgrploads[j]; 2145 2146 ASSERT(!lpl_rset_contains(i_lpl, lpl)); 2147 if (lpl_rset_contains(i_lpl, lpl)) { 2148 return (LPL_TOPO_LPL_ORPHANED); 2149 } 2150 } 2151 /* lgroup is empty, and everything is ok. continue */ 2152 continue; 2153 } 2154 2155 2156 /* lgroup is in this partition, now check it against lpl */ 2157 2158 /* do both have matching lgrps? */ 2159 ASSERT(lgrp == lpl->lpl_lgrp); 2160 if (lgrp != lpl->lpl_lgrp) { 2161 return (LPL_TOPO_LGRP_MISMATCH); 2162 } 2163 2164 /* do the parent lgroups exist and do they match? */ 2165 if (lgrp->lgrp_parent) { 2166 ASSERT(lpl->lpl_parent); 2167 ASSERT(lgrp->lgrp_parent->lgrp_id == 2168 lpl->lpl_parent->lpl_lgrpid); 2169 2170 if (!lpl->lpl_parent) { 2171 return (LPL_TOPO_MISSING_PARENT); 2172 } else if (lgrp->lgrp_parent->lgrp_id != 2173 lpl->lpl_parent->lpl_lgrpid) { 2174 return (LPL_TOPO_PARENT_MISMATCH); 2175 } 2176 } 2177 2178 /* only leaf lgroups keep a cpucnt, only check leaves */ 2179 if ((lpl->lpl_nrset == 1) && (lpl == lpl->lpl_rset[0])) { 2180 2181 /* verify that lgrp is also a leaf */ 2182 ASSERT((lgrp->lgrp_childcnt == 0) && 2183 (klgrpset_ismember(lgrp->lgrp_leaves, 2184 lpl->lpl_lgrpid))); 2185 2186 if ((lgrp->lgrp_childcnt > 0) || 2187 (!klgrpset_ismember(lgrp->lgrp_leaves, 2188 lpl->lpl_lgrpid))) { 2189 return (LPL_TOPO_LGRP_NOT_LEAF); 2190 } 2191 2192 ASSERT((lgrp->lgrp_cpucnt >= lpl->lpl_ncpu) && 2193 (lpl->lpl_ncpu > 0)); 2194 if ((lgrp->lgrp_cpucnt < lpl->lpl_ncpu) || 2195 (lpl->lpl_ncpu <= 0)) { 2196 return (LPL_TOPO_BAD_CPUCNT); 2197 } 2198 2199 /* 2200 * Check that lpl_ncpu also matches the number of 2201 * cpus in the lpl's linked list. 
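 * (lpl_cpus is a circular doubly linked list threaded through cpu_next_lpl and cpu_prev_lpl, so the walk below ends once it returns to its starting cpu.)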
This only exists in 2202 * leaves, but they should always match. 2203 */ 2204 j = 0; 2205 cpu = cp_start = lpl->lpl_cpus; 2206 while (cpu != NULL) { 2207 j++; 2208 2209 /* check to make sure cpu's lpl is leaf lpl */ 2210 ASSERT(cpu->cpu_lpl == lpl); 2211 if (cpu->cpu_lpl != lpl) { 2212 return (LPL_TOPO_CPU_HAS_BAD_LPL); 2213 } 2214 2215 /* check next cpu */ 2216 if ((cpu = cpu->cpu_next_lpl) != cp_start) { 2217 continue; 2218 } else { 2219 cpu = NULL; 2220 } 2221 } 2222 2223 ASSERT(j == lpl->lpl_ncpu); 2224 if (j != lpl->lpl_ncpu) { 2225 return (LPL_TOPO_LPL_BAD_NCPU); 2226 } 2227 2228 /* 2229 * Also, check that leaf lpl is contained in all 2230 * intermediate lpls that name the leaf as a descendant 2231 */ 2232 2233 for (j = 0; j <= lgrp_alloc_max; j++) { 2234 klgrpset_t intersect; 2235 lgrp_t *lgrp_cand; 2236 lpl_t *lpl_cand; 2237 2238 lgrp_cand = lgrp_table[j]; 2239 intersect = klgrpset_intersects( 2240 lgrp_cand->lgrp_set[LGRP_RSRC_CPU], 2241 cpupart->cp_lgrpset); 2242 2243 if (!LGRP_EXISTS(lgrp_cand) || 2244 !klgrpset_intersects(lgrp_cand->lgrp_leaves, 2245 cpupart->cp_lgrpset) || 2246 (intersect == 0)) 2247 continue; 2248 2249 lpl_cand = 2250 &cpupart->cp_lgrploads[lgrp_cand->lgrp_id]; 2251 2252 if (klgrpset_ismember(intersect, 2253 lgrp->lgrp_id)) { 2254 ASSERT(lpl_rset_contains(lpl_cand, 2255 lpl)); 2256 2257 if (!lpl_rset_contains(lpl_cand, lpl)) { 2258 return (LPL_TOPO_RSET_MSSNG_LF); 2259 } 2260 } 2261 } 2262 2263 } else { /* non-leaf specific checks */ 2264 2265 /* 2266 * Non-leaf lpls should have lpl_cpus == NULL 2267 * verify that this is so 2268 */ 2269 ASSERT(lpl->lpl_cpus == NULL); 2270 if (lpl->lpl_cpus != NULL) { 2271 return (LPL_TOPO_NONLEAF_HAS_CPUS); 2272 } 2273 2274 /* 2275 * verify that the sum of the cpus in the leaf resources 2276 * is equal to the total ncpu in the intermediate 2277 */ 2278 for (j = sum = 0; j < lpl->lpl_nrset; j++) { 2279 sum += lpl->lpl_rset[j]->lpl_ncpu; 2280 } 2281 2282 ASSERT(sum == lpl->lpl_ncpu); 2283 if (sum != lpl->lpl_ncpu) { 2284 return (LPL_TOPO_LPL_BAD_NCPU); 2285 } 2286 } 2287 2288 /* 2289 * check on lpl_hint. Don't check root, since it has no parent. 2290 */ 2291 if (lpl->lpl_parent != NULL) { 2292 int hint; 2293 lpl_t *hint_lpl; 2294 2295 /* make sure hint is within limits of nrset */ 2296 hint = lpl->lpl_hint; 2297 ASSERT(lpl->lpl_parent->lpl_nrset >= hint); 2298 if (lpl->lpl_parent->lpl_nrset < hint) { 2299 return (LPL_TOPO_BOGUS_HINT); 2300 } 2301 2302 /* make sure hint points to valid lpl */ 2303 hint_lpl = lpl->lpl_parent->lpl_rset[hint]; 2304 ASSERT(hint_lpl->lpl_ncpu > 0); 2305 if (hint_lpl->lpl_ncpu <= 0) { 2306 return (LPL_TOPO_BOGUS_HINT); 2307 } 2308 } 2309 2310 /* 2311 * Check the rset of the lpl in question. Make sure that each 2312 * rset contains a subset of the resources in 2313 * lgrp_set[LGRP_RSRC_CPU] and in cp_lgrpset. This also makes 2314 * sure that each rset doesn't include resources that are 2315 * outside of that set. (Which would be resources somehow not 2316 * accounted for). 
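 * In set terms, after rset is built below, both klgrpset_diff() results must be empty: rset minus lgrp_set[LGRP_RSRC_CPU] and rset minus cp_lgrpset should each leave nothing behind.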
2317 */ 2318 2319 klgrpset_clear(rset); 2320 for (j = 0; j < lpl->lpl_nrset; j++) { 2321 klgrpset_add(rset, lpl->lpl_rset[j]->lpl_lgrpid); 2322 } 2323 klgrpset_copy(cset, rset); 2324 /* make sure lpl rset matches lgrp rset */ 2325 klgrpset_diff(rset, lgrp->lgrp_set[LGRP_RSRC_CPU]); 2326 /* make sure rset is contained with in partition, too */ 2327 klgrpset_diff(cset, cpupart->cp_lgrpset); 2328 2329 ASSERT(klgrpset_isempty(rset) && 2330 klgrpset_isempty(cset)); 2331 if (!klgrpset_isempty(rset) || 2332 !klgrpset_isempty(cset)) { 2333 return (LPL_TOPO_RSET_MISMATCH); 2334 } 2335 2336 /* 2337 * check to make sure lpl_nrset matches the number of rsets 2338 * contained in the lpl 2339 */ 2340 2341 for (j = 0; (lpl->lpl_rset[j] != NULL) && (j < LPL_RSET_MAX); 2342 j++); 2343 2344 ASSERT(j == lpl->lpl_nrset); 2345 if (j != lpl->lpl_nrset) { 2346 return (LPL_TOPO_BAD_RSETCNT); 2347 } 2348 2349 } 2350 return (LPL_TOPO_CORRECT); 2351 } 2352 2353 /* 2354 * Flatten lpl topology to given number of levels. This is presently only 2355 * implemented for a flatten to 2 levels, which will prune out the intermediates 2356 * and home the leaf lpls to the root lpl. 2357 */ 2358 int 2359 lpl_topo_flatten(int levels) 2360 { 2361 int i; 2362 uint_t sum; 2363 lgrp_t *lgrp_cur; 2364 lpl_t *lpl_cur; 2365 lpl_t *lpl_root; 2366 cpupart_t *cp; 2367 2368 if (levels != 2) 2369 return (0); 2370 2371 /* called w/ cpus paused - grab no locks! */ 2372 ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 || 2373 !lgrp_initialized); 2374 2375 cp = cp_list_head; 2376 do { 2377 lpl_root = &cp->cp_lgrploads[lgrp_root->lgrp_id]; 2378 ASSERT(LGRP_EXISTS(lgrp_root) && (lpl_root->lpl_ncpu > 0)); 2379 2380 for (i = 0; i <= lgrp_alloc_max; i++) { 2381 lgrp_cur = lgrp_table[i]; 2382 lpl_cur = &cp->cp_lgrploads[i]; 2383 2384 if ((lgrp_cur == lgrp_root) || 2385 (!LGRP_EXISTS(lgrp_cur) && 2386 (lpl_cur->lpl_ncpu == 0))) 2387 continue; 2388 2389 if (!LGRP_EXISTS(lgrp_cur) && (lpl_cur->lpl_ncpu > 0)) { 2390 /* 2391 * this should be a deleted intermediate, so 2392 * clear it 2393 */ 2394 lpl_clear(lpl_cur); 2395 } else if ((lpl_cur->lpl_nrset == 1) && 2396 (lpl_cur->lpl_rset[0] == lpl_cur) && 2397 ((lpl_cur->lpl_parent->lpl_ncpu == 0) || 2398 (!LGRP_EXISTS(lpl_cur->lpl_parent->lpl_lgrp)))) { 2399 /* 2400 * this is a leaf whose parent was deleted, or 2401 * whose parent had their lgrp deleted. (And 2402 * whose parent will soon be deleted). Point 2403 * this guy back to the root lpl. 2404 */ 2405 lpl_cur->lpl_parent = lpl_root; 2406 lpl_rset_add(lpl_root, lpl_cur); 2407 } 2408 2409 } 2410 2411 /* 2412 * Now that we're done, make sure the count on the root lpl is 2413 * correct, and update the hints of the children for the sake of 2414 * thoroughness 2415 */ 2416 for (i = sum = 0; i < lpl_root->lpl_nrset; i++) { 2417 sum += lpl_root->lpl_rset[i]->lpl_ncpu; 2418 } 2419 lpl_root->lpl_ncpu = sum; 2420 lpl_child_update(lpl_root, cp); 2421 2422 cp = cp->cp_next; 2423 } while (cp != cp_list_head); 2424 2425 return (levels); 2426 } 2427 2428 /* 2429 * Insert a lpl into the resource hierarchy and create any additional lpls that 2430 * are necessary to represent the varying states of locality for the cpu 2431 * resoruces newly added to the partition. 2432 * 2433 * This routine is clever enough that it can correctly add resources from the 2434 * new leaf into both direct and indirect resource sets in the hierarchy. (Ie, 2435 * those for which the lpl is a leaf as opposed to simply a named equally local 2436 * resource). 
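 * (A hypothetical example of the special case described next: bringing the first cpu of a second board into a partition that already spans one board creates an intermediate lpl covering both boards, and that new intermediate must also pick up the first board's existing leaf.)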
The one special case that needs additional processing is when a 2437 * new intermediate lpl is introduced. Since the main loop only traverses 2438 * looking to add the leaf resource where it does not yet exist, additional work 2439 * is necessary to add other leaf resources that may need to exist in the newly 2440 * created intermediate. This is performed by the second inner loop, and is 2441 * only done when the check for more than one overlapping resource succeeds. 2442 */ 2443 2444 void 2445 lpl_leaf_insert(lpl_t *lpl_leaf, cpupart_t *cpupart) 2446 { 2447 int i; 2448 int j; 2449 int hint; 2450 int rset_num_intersect; 2451 lgrp_t *lgrp_cur; 2452 lpl_t *lpl_cur; 2453 lpl_t *lpl_parent; 2454 lgrpid_t parent_id; 2455 klgrpset_t rset_intersect; /* resources in cpupart and lgrp */ 2456 2457 for (i = 0; i <= lgrp_alloc_max; i++) { 2458 lgrp_cur = lgrp_table[i]; 2459 2460 /* 2461 * Don't insert if the lgrp isn't there, if the leaf isn't 2462 * contained within the current lgrp, or if the current lgrp has 2463 * no leaves in this partition 2464 */ 2465 2466 if (!LGRP_EXISTS(lgrp_cur) || 2467 !klgrpset_ismember(lgrp_cur->lgrp_set[LGRP_RSRC_CPU], 2468 lpl_leaf->lpl_lgrpid) || 2469 !klgrpset_intersects(lgrp_cur->lgrp_leaves, 2470 cpupart->cp_lgrpset)) 2471 continue; 2472 2473 lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id]; 2474 if (lgrp_cur->lgrp_parent != NULL) { 2475 /* if lgrp has a parent, assign it properly */ 2476 parent_id = lgrp_cur->lgrp_parent->lgrp_id; 2477 lpl_parent = &cpupart->cp_lgrploads[parent_id]; 2478 } else { 2479 /* if not, make sure parent ptr gets set to null */ 2480 lpl_parent = NULL; 2481 } 2482 2483 if (lpl_cur == lpl_leaf) { 2484 /* 2485 * Almost all leaf state was initialized elsewhere. The 2486 * only thing left to do is to set the parent. 2487 */ 2488 lpl_cur->lpl_parent = lpl_parent; 2489 continue; 2490 } 2491 2492 /* 2493 * Initialize intermediate lpl 2494 * Save this lpl's hint though. Since we're changing this 2495 * lpl's resources, we need to update the hint in this lpl's 2496 * children, but the hint in this lpl is unaffected and 2497 * should be preserved. 2498 */ 2499 hint = lpl_cur->lpl_hint; 2500 2501 lpl_clear(lpl_cur); 2502 lpl_init(lpl_cur, lpl_leaf, lgrp_cur); 2503 2504 lpl_cur->lpl_hint = hint; 2505 lpl_cur->lpl_parent = lpl_parent; 2506 2507 /* does new lpl need to be populated with other resources? */ 2508 rset_intersect = 2509 klgrpset_intersects(lgrp_cur->lgrp_set[LGRP_RSRC_CPU], 2510 cpupart->cp_lgrpset); 2511 klgrpset_nlgrps(rset_intersect, rset_num_intersect); 2512 2513 if (rset_num_intersect > 1) { 2514 /* 2515 * If so, figure out what lpls have resources that 2516 * intersect this one, and add them. 2517 */ 2518 for (j = 0; j <= lgrp_alloc_max; j++) { 2519 lgrp_t *lgrp_cand; /* candidate lgrp */ 2520 lpl_t *lpl_cand; /* candidate lpl */ 2521 2522 lgrp_cand = lgrp_table[j]; 2523 if (!LGRP_EXISTS(lgrp_cand) || 2524 !klgrpset_ismember(rset_intersect, 2525 lgrp_cand->lgrp_id)) 2526 continue; 2527 lpl_cand = 2528 &cpupart->cp_lgrploads[lgrp_cand->lgrp_id]; 2529 lpl_rset_add(lpl_cur, lpl_cand); 2530 } 2531 } 2532 /* 2533 * This lpl's rset has changed. Update the hint in it's 2534 * children. 2535 */ 2536 lpl_child_update(lpl_cur, cpupart); 2537 } 2538 } 2539 2540 /* 2541 * remove a lpl from the hierarchy of resources, clearing its state when 2542 * finished. If the lpls at the intermediate levels of the hierarchy have no 2543 * remaining resources, or no longer name a leaf resource in the cpu-partition, 2544 * delete them as well. 
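 * (Within this file the routine is invoked from lgrp_part_del_cpu(), once the last cpu of a leaf lpl leaves the partition.)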
2545 */ 2546 2547 void 2548 lpl_leaf_remove(lpl_t *lpl_leaf, cpupart_t *cpupart) 2549 { 2550 int i; 2551 lgrp_t *lgrp_cur; 2552 lpl_t *lpl_cur; 2553 klgrpset_t leaf_intersect; /* intersection of leaves */ 2554 2555 for (i = 0; i <= lgrp_alloc_max; i++) { 2556 lgrp_cur = lgrp_table[i]; 2557 2558 /* 2559 * Don't attempt to remove from lgrps that aren't there, that 2560 * don't contain our leaf, or from the leaf itself. (We do that 2561 * later) 2562 */ 2563 2564 if (!LGRP_EXISTS(lgrp_cur)) 2565 continue; 2566 2567 lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id]; 2568 2569 if (!klgrpset_ismember(lgrp_cur->lgrp_set[LGRP_RSRC_CPU], 2570 lpl_leaf->lpl_lgrpid) || 2571 (lpl_cur == lpl_leaf)) { 2572 continue; 2573 } 2574 2575 /* 2576 * This is a slightly sleazy simplification in that we have 2577 * already marked the cp_lgrpset as no longer containing the 2578 * leaf we've deleted. Any lpls that pass the above checks 2579 * based upon lgrp membership but not necessarily cpu-part 2580 * membership also get cleared by the checks below. Currently 2581 * this is harmless, as the lpls should be empty anyway. 2582 * 2583 * In particular, we want to preserve lpls that have additional 2584 * leaf resources, even though we don't yet have a processor 2585 * architecture that represents resources this way. 2586 */ 2587 2588 leaf_intersect = klgrpset_intersects(lgrp_cur->lgrp_leaves, 2589 cpupart->cp_lgrpset); 2590 2591 lpl_rset_del(lpl_cur, lpl_leaf); 2592 if ((lpl_cur->lpl_nrset == 0) || (!leaf_intersect)) { 2593 lpl_clear(lpl_cur); 2594 } else { 2595 /* 2596 * Update this lpl's children 2597 */ 2598 lpl_child_update(lpl_cur, cpupart); 2599 } 2600 } 2601 lpl_clear(lpl_leaf); 2602 } 2603 2604 /* 2605 * add a cpu to a partition in terms of lgrp load avg bookkeeping 2606 * 2607 * The lpl (cpu partition load average information) is now arranged in a 2608 * hierarchical fashion whereby resources that are closest, i.e. most local, to 2609 * the cpu in question are considered to be leaves in a tree of resources. 2610 * There are two general cases for cpu addition: 2611 * 2612 * 1. An lpl structure that contains resources already in the hierarchy tree. 2613 * In this case, all of the associated lpl relationships have been defined, and 2614 * all that is necessary is that we link the new cpu into the per-lpl list of 2615 * cpus, and increment the ncpu count of all places where this cpu resource will 2616 * be accounted for. lpl_cpu_adjcnt updates the cpu count, and the cpu pointer 2617 * pushing is accomplished by this routine. 2618 * 2619 * 2. The lpl to contain the resources in this cpu-partition for this lgrp does 2620 * not exist yet. In this case, it is necessary to build the leaf lpl, and 2621 * construct the hierarchy of state necessary to name its more distant 2622 * resources, if they should exist. The leaf structure is initialized by this 2623 * routine, as is the cpu-partition state for the lgrp membership. This routine 2624 * also calls lpl_leaf_insert() which inserts the named lpl into the hierarchy 2625 * and builds all of the "ancestral" state necessary to identify resources at 2626 * differing levels of locality.
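 * Case selection is keyed off of lpl_ncpu below: the first cpu added to a leaf (lpl_ncpu was zero) takes case 2, while any later cpu takes case 1.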
2627 */ 2628 void 2629 lgrp_part_add_cpu(cpu_t *cp, lgrp_id_t lgrpid) 2630 { 2631 cpupart_t *cpupart; 2632 lgrp_t *lgrp_leaf; 2633 lpl_t *lpl_leaf; 2634 2635 /* called sometimes w/ cpus paused - grab no locks */ 2636 ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized); 2637 2638 cpupart = cp->cpu_part; 2639 lgrp_leaf = lgrp_table[lgrpid]; 2640 2641 /* don't add non-existent lgrp */ 2642 ASSERT(LGRP_EXISTS(lgrp_leaf)); 2643 lpl_leaf = &cpupart->cp_lgrploads[lgrpid]; 2644 cp->cpu_lpl = lpl_leaf; 2645 2646 /* only leaf lpls contain cpus */ 2647 2648 if (lpl_leaf->lpl_ncpu++ == 0) { 2649 lpl_init(lpl_leaf, lpl_leaf, lgrp_leaf); 2650 klgrpset_add(cpupart->cp_lgrpset, lgrpid); 2651 lpl_leaf_insert(lpl_leaf, cpupart); 2652 } else { 2653 /* 2654 * the lpl should already exist in the parent, so just update 2655 * the count of available CPUs 2656 */ 2657 lpl_cpu_adjcnt(LPL_INCREMENT, cp); 2658 } 2659 2660 /* link cpu into list of cpus in lpl */ 2661 2662 if (lpl_leaf->lpl_cpus) { 2663 cp->cpu_next_lpl = lpl_leaf->lpl_cpus; 2664 cp->cpu_prev_lpl = lpl_leaf->lpl_cpus->cpu_prev_lpl; 2665 lpl_leaf->lpl_cpus->cpu_prev_lpl->cpu_next_lpl = cp; 2666 lpl_leaf->lpl_cpus->cpu_prev_lpl = cp; 2667 } else { 2668 /* 2669 * We increment ncpu immediately after we create a new leaf 2670 * lpl, so assert that ncpu == 1 for the case where we don't 2671 * have any cpu pointers yet. 2672 */ 2673 ASSERT(lpl_leaf->lpl_ncpu == 1); 2674 lpl_leaf->lpl_cpus = cp->cpu_next_lpl = cp->cpu_prev_lpl = cp; 2675 } 2676 2677 } 2678 2679 2680 /* 2681 * remove a cpu from a partition in terms of lgrp load avg bookkeeping 2682 * 2683 * The lpl (cpu partition load average information) is now arranged in a 2684 * hierarchical fashion whereby resources that are closest, i.e. most local, to 2685 * the cpu in question are considered to be leaves in a tree of resources. 2686 * There are two removal cases in question: 2687 * 2688 * 1. Removal of the resource in the leaf leaves other resources remaining in 2689 * that leaf. (Another cpu still exists at this level of locality). In this 2690 * case, the count of available cpus is decremented in all associated lpls by 2691 * calling lpl_cpu_adjcnt(), and the pointer to the removed cpu is pruned 2692 * from the per-cpu lpl list. 2693 * 2694 * 2. Removal of the resource results in the lpl containing no resources. (It's 2695 * empty) In this case, all of what has occurred for the first step must take 2696 * place; however, additionally we must remove the lpl structure itself, prune 2697 * out any stranded lpls that do not directly name a leaf resource, and mark the 2698 * cpu partition in question as no longer containing resources from the lgrp of 2699 * the lpl that has been deleted. Cpu-partition changes are handled by this 2700 * method, but the lpl_leaf_remove function deals with the details of pruning 2701 * out the empty lpl and any of its orphaned direct ancestors.
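 * As with cpu addition, the two cases are distinguished by lpl_ncpu below: decrementing it to zero means the leaf is now empty and must be torn down via lpl_leaf_remove().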
2702 */ 2703 void 2704 lgrp_part_del_cpu(cpu_t *cp) 2705 { 2706 lpl_t *lpl; 2707 lpl_t *leaf_lpl; 2708 lgrp_t *lgrp_leaf; 2709 2710 /* called sometimes w/ cpus paused - grab no locks */ 2711 2712 ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized); 2713 2714 lpl = leaf_lpl = cp->cpu_lpl; 2715 lgrp_leaf = leaf_lpl->lpl_lgrp; 2716 2717 /* don't delete a leaf that isn't there */ 2718 ASSERT(LGRP_EXISTS(lgrp_leaf)); 2719 2720 /* no double-deletes */ 2721 ASSERT(lpl->lpl_ncpu); 2722 if (--lpl->lpl_ncpu == 0) { 2723 /* 2724 * This was the last cpu in this lgroup for this partition, 2725 * clear its bit in the partition's lgroup bitmask 2726 */ 2727 klgrpset_del(cp->cpu_part->cp_lgrpset, lpl->lpl_lgrpid); 2728 2729 /* eliminate remaning lpl link pointers in cpu, lpl */ 2730 lpl->lpl_cpus = cp->cpu_next_lpl = cp->cpu_prev_lpl = NULL; 2731 2732 lpl_leaf_remove(leaf_lpl, cp->cpu_part); 2733 } else { 2734 2735 /* unlink cpu from lists of cpus in lpl */ 2736 cp->cpu_prev_lpl->cpu_next_lpl = cp->cpu_next_lpl; 2737 cp->cpu_next_lpl->cpu_prev_lpl = cp->cpu_prev_lpl; 2738 if (lpl->lpl_cpus == cp) { 2739 lpl->lpl_cpus = cp->cpu_next_lpl; 2740 } 2741 2742 /* 2743 * Update the cpu count in the lpls associated with parent 2744 * lgroups. 2745 */ 2746 lpl_cpu_adjcnt(LPL_DECREMENT, cp); 2747 2748 } 2749 /* clear cpu's lpl ptr when we're all done */ 2750 cp->cpu_lpl = NULL; 2751 } 2752 2753 /* 2754 * Recompute load average for the specified partition/lgrp fragment. 2755 * 2756 * We rely on the fact that this routine is called from the clock thread 2757 * at a point before the clock thread can block (i.e. before its first 2758 * lock request). Since the clock thread can not be preempted (since it 2759 * runs at highest priority), we know that cpu partitions can not change 2760 * (since doing so would require either the repartition requester or the 2761 * cpu_pause thread to run on this cpu), so we can update the cpu's load 2762 * without grabbing cpu_lock. 2763 */ 2764 void 2765 lgrp_loadavg(lpl_t *lpl, uint_t nrcpus, int ageflag) 2766 { 2767 uint_t ncpu; 2768 int64_t old, new, f; 2769 2770 /* 2771 * 1 - exp(-1/(20 * ncpu)) << 13 = 400 for 1 cpu... 2772 */ 2773 static short expval[] = { 2774 0, 3196, 1618, 1083, 2775 814, 652, 543, 466, 2776 408, 363, 326, 297, 2777 272, 251, 233, 218, 2778 204, 192, 181, 172, 2779 163, 155, 148, 142, 2780 136, 130, 125, 121, 2781 116, 112, 109, 105 2782 }; 2783 2784 /* ASSERT (called from clock level) */ 2785 2786 if ((lpl == NULL) || /* we're booting - this is easiest for now */ 2787 ((ncpu = lpl->lpl_ncpu) == 0)) { 2788 return; 2789 } 2790 2791 for (;;) { 2792 2793 if (ncpu >= sizeof (expval) / sizeof (expval[0])) 2794 f = expval[1]/ncpu; /* good approx. for large ncpu */ 2795 else 2796 f = expval[ncpu]; 2797 2798 /* 2799 * Modify the load average atomically to avoid losing 2800 * anticipatory load updates (see lgrp_move_thread()). 2801 */ 2802 if (ageflag) { 2803 /* 2804 * We're supposed to both update and age the load. 2805 * This happens 10 times/sec. per cpu. We do a 2806 * little hoop-jumping to avoid integer overflow. 
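 * (Roughly: old is split into a high half q and a low half r so that the 64-bit multiplies by f cannot overflow; since f is a fixed-point fraction scaled by 2^16, the update below approximates new = old + f * (load - old), an exponential decay of the load average toward the instantaneous load indicated by nrcpus.)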
2807 */ 2808 int64_t q, r; 2809 2810 do { 2811 old = new = lpl->lpl_loadavg; 2812 q = (old >> 16) << 7; 2813 r = (old & 0xffff) << 7; 2814 new += ((long long)(nrcpus - q) * f - 2815 ((r * f) >> 16)) >> 7; 2816 2817 /* 2818 * Check for overflow 2819 */ 2820 if (new > LGRP_LOADAVG_MAX) 2821 new = LGRP_LOADAVG_MAX; 2822 else if (new < 0) 2823 new = 0; 2824 } while (cas32((lgrp_load_t *)&lpl->lpl_loadavg, old, 2825 new) != old); 2826 } else { 2827 /* 2828 * We're supposed to update the load, but not age it. 2829 * This option is used to update the load (which either 2830 * has already been aged in this 1/10 sec. interval or 2831 * soon will be) to account for a remotely executing 2832 * thread. 2833 */ 2834 do { 2835 old = new = lpl->lpl_loadavg; 2836 new += f; 2837 /* 2838 * Check for overflow 2839 * Underflow not possible here 2840 */ 2841 if (new < old) 2842 new = LGRP_LOADAVG_MAX; 2843 } while (cas32((lgrp_load_t *)&lpl->lpl_loadavg, old, 2844 new) != old); 2845 } 2846 2847 /* 2848 * Do the same for this lpl's parent 2849 */ 2850 if ((lpl = lpl->lpl_parent) == NULL) 2851 break; 2852 ncpu = lpl->lpl_ncpu; 2853 } 2854 } 2855 2856 /* 2857 * Initialize lpl topology in the target based on topology currently present in 2858 * lpl_bootstrap. 2859 * 2860 * lpl_topo_bootstrap is only called once from cpupart_initialize_default() to 2861 * initialize cp_default list of lpls. Up to this point all topology operations 2862 * were performed using lpl_bootstrap. Now cp_default has its own list of lpls 2863 * and all subsequent lpl operations should use it instead of lpl_bootstrap. The 2864 * `target' points to the list of lpls in cp_default and `size' is the size of 2865 * this list. 2866 * 2867 * This function walks the lpl topology in lpl_bootstrap and does four things: 2868 * 2869 * 1) Copies all fields from lpl_bootstrap to the target. 2870 * 2871 * 2) Sets CPU0 lpl pointer to the correct element of the target list. 2872 * 2873 * 3) Updates lpl_parent pointers to point to the lpls in the target list 2874 * instead of lpl_bootstrap. 2875 * 2876 * 4) Updates pointers in the resource list of the target to point to the lpls 2877 * in the target list instead of lpl_bootstrap. 2878 * 2879 * After lpl_topo_bootstrap() completes, target contains the same information 2880 * that would be present there if it were used during boot instead of 2881 * lpl_bootstrap. There is no need for the information in lpl_bootstrap after 2882 * this, so it is bzeroed. 2883 */ 2884 void 2885 lpl_topo_bootstrap(lpl_t *target, int size) 2886 { 2887 lpl_t *lpl = lpl_bootstrap; 2888 lpl_t *target_lpl = target; 2889 int howmany; 2890 int id; 2891 int i; 2892 2893 /* 2894 * The only target that should be passed here is the cp_default lpl list. 2895 */ 2896 ASSERT(target == cp_default.cp_lgrploads); 2897 ASSERT(size == cp_default.cp_nlgrploads); 2898 ASSERT(!lgrp_topo_initialized); 2899 ASSERT(ncpus == 1); 2900 2901 howmany = MIN(LPL_BOOTSTRAP_SIZE, size); 2902 for (i = 0; i < howmany; i++, lpl++, target_lpl++) { 2903 /* 2904 * Copy all fields from lpl. 2905 */ 2906 2907 *target_lpl = *lpl; 2908 2909 /* 2910 * Substitute CPU0 lpl pointer with one relative to target. 2911 */ 2912 if (lpl->lpl_cpus == CPU) { 2913 ASSERT(CPU->cpu_lpl == lpl); 2914 CPU->cpu_lpl = target_lpl; 2915 } 2916 2917 /* 2918 * Substitute parent information with parent relative to target.
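 * The pointer is simply rebased from the bootstrap array into the target array, i.e. new parent = target + (old parent - lpl_bootstrap), computed with the uintptr_t arithmetic below.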
2919 */ 2920 if (lpl->lpl_parent != NULL) 2921 target_lpl->lpl_parent = (lpl_t *) 2922 (((uintptr_t)lpl->lpl_parent - 2923 (uintptr_t)lpl_bootstrap) + 2924 (uintptr_t)target); 2925 2926 /* 2927 * Walk over resource set substituting pointers relative to 2928 * lpl_bootstrap to pointers relative to target. 2929 */ 2930 ASSERT(lpl->lpl_nrset <= 1); 2931 2932 for (id = 0; id < lpl->lpl_nrset; id++) { 2933 if (lpl->lpl_rset[id] != NULL) { 2934 target_lpl->lpl_rset[id] = 2935 (lpl_t *) 2936 (((uintptr_t)lpl->lpl_rset[id] - 2937 (uintptr_t)lpl_bootstrap) + 2938 (uintptr_t)target); 2939 } 2940 } 2941 } 2942 2943 /* 2944 * Topology information in lpl_bootstrap is no longer needed. 2945 */ 2946 bzero(lpl_bootstrap_list, sizeof (lpl_bootstrap_list)); 2947 } 2948 2949 /* the maximum effect that a single thread can have on it's lgroup's load */ 2950 #define LGRP_LOADAVG_MAX_EFFECT(ncpu) \ 2951 ((lgrp_loadavg_max_effect) / (ncpu)) 2952 uint32_t lgrp_loadavg_max_effect = LGRP_LOADAVG_THREAD_MAX; 2953 2954 /* 2955 * If the lowest load among the lgroups a process' threads are currently 2956 * spread across is greater than lgrp_expand_proc_thresh, we'll consider 2957 * expanding the process to a new lgroup. 2958 */ 2959 #define LGRP_EXPAND_PROC_THRESH_DEFAULT 62250 2960 lgrp_load_t lgrp_expand_proc_thresh = LGRP_EXPAND_PROC_THRESH_DEFAULT; 2961 2962 #define LGRP_EXPAND_PROC_THRESH(ncpu) \ 2963 ((lgrp_expand_proc_thresh) / (ncpu)) 2964 2965 /* 2966 * A process will be expanded to a new lgroup only if the difference between 2967 * the lowest load on the lgroups the process' thread's are currently spread 2968 * across and the lowest load on the other lgroups in the process' partition 2969 * is greater than lgrp_expand_proc_diff. 2970 */ 2971 #define LGRP_EXPAND_PROC_DIFF_DEFAULT 60000 2972 lgrp_load_t lgrp_expand_proc_diff = LGRP_EXPAND_PROC_DIFF_DEFAULT; 2973 2974 #define LGRP_EXPAND_PROC_DIFF(ncpu) \ 2975 ((lgrp_expand_proc_diff) / (ncpu)) 2976 2977 /* 2978 * The loadavg tolerance accounts for "noise" inherent in the load, which may 2979 * be present due to impreciseness of the load average decay algorithm. 2980 * 2981 * The default tolerance is lgrp_loadavg_max_effect. Note that the tunable 2982 * tolerance is scaled by the number of cpus in the lgroup just like 2983 * lgrp_loadavg_max_effect. For example, if lgrp_loadavg_tolerance = 0x10000, 2984 * and ncpu = 4, then lgrp_choose will consider differences in lgroup loads 2985 * of: 0x10000 / 4 => 0x4000 or greater to be significant. 2986 */ 2987 uint32_t lgrp_loadavg_tolerance = LGRP_LOADAVG_THREAD_MAX; 2988 #define LGRP_LOADAVG_TOLERANCE(ncpu) \ 2989 ((lgrp_loadavg_tolerance) / ncpu) 2990 2991 /* 2992 * lgrp_choose() will choose root lgroup as home when lowest lgroup load 2993 * average is above this threshold 2994 */ 2995 uint32_t lgrp_load_thresh = UINT32_MAX; 2996 2997 /* 2998 * lgrp_choose() will try to skip any lgroups with less memory 2999 * than this free when choosing a home lgroup 3000 */ 3001 pgcnt_t lgrp_mem_free_thresh = 0; 3002 3003 /* 3004 * When choosing between similarly loaded lgroups, lgrp_choose() will pick 3005 * one based on one of the following policies: 3006 * - Random selection 3007 * - Pseudo round robin placement 3008 * - Longest time since a thread was last placed 3009 */ 3010 #define LGRP_CHOOSE_RANDOM 1 3011 #define LGRP_CHOOSE_RR 2 3012 #define LGRP_CHOOSE_TIME 3 3013 3014 int lgrp_choose_policy = LGRP_CHOOSE_TIME; 3015 3016 /* 3017 * Choose a suitable leaf lgroup for a kthread. 
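 * The search starts from an lgroup chosen per lgrp_choose_policy and generally settles on the least loaded leaf as judged by lpl_pick(), preferring lgroups that the thread's process already spans.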
The kthread is assumed not to 3018 * be bound to a CPU or processor set. 3019 * 3020 * Arguments: 3021 * t The thread 3022 * cpupart The partition the thread belongs to. 3023 * 3024 * NOTE: Should at least be called with the cpu_lock held, kernel preemption 3025 * disabled, or thread_lock held (at splhigh) to protect against the CPU 3026 * partitions changing out from under us and assumes that given thread is 3027 * protected. Also, called sometimes w/ cpus paused or kernel preemption 3028 * disabled, so don't grab any locks because we should never block under 3029 * those conditions. 3030 */ 3031 lpl_t * 3032 lgrp_choose(kthread_t *t, cpupart_t *cpupart) 3033 { 3034 lgrp_load_t bestload, bestrload; 3035 int lgrpid_offset, lgrp_count; 3036 lgrp_id_t lgrpid, lgrpid_start; 3037 lpl_t *lpl, *bestlpl, *bestrlpl; 3038 klgrpset_t lgrpset; 3039 proc_t *p; 3040 3041 ASSERT(t != NULL); 3042 ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 || 3043 THREAD_LOCK_HELD(t)); 3044 ASSERT(cpupart != NULL); 3045 3046 p = t->t_procp; 3047 3048 /* A process should always be in an active partition */ 3049 ASSERT(!klgrpset_isempty(cpupart->cp_lgrpset)); 3050 3051 bestlpl = bestrlpl = NULL; 3052 bestload = bestrload = LGRP_LOADAVG_MAX; 3053 lgrpset = cpupart->cp_lgrpset; 3054 3055 switch (lgrp_choose_policy) { 3056 case LGRP_CHOOSE_RR: 3057 lgrpid = cpupart->cp_lgrp_hint; 3058 do { 3059 if (++lgrpid > lgrp_alloc_max) 3060 lgrpid = 0; 3061 } while (!klgrpset_ismember(lgrpset, lgrpid)); 3062 3063 break; 3064 default: 3065 case LGRP_CHOOSE_TIME: 3066 case LGRP_CHOOSE_RANDOM: 3067 klgrpset_nlgrps(lgrpset, lgrp_count); 3068 lgrpid_offset = 3069 (((ushort_t)(gethrtime() >> 4)) % lgrp_count) + 1; 3070 for (lgrpid = 0; ; lgrpid++) { 3071 if (klgrpset_ismember(lgrpset, lgrpid)) { 3072 if (--lgrpid_offset == 0) 3073 break; 3074 } 3075 } 3076 break; 3077 } 3078 3079 lgrpid_start = lgrpid; 3080 3081 DTRACE_PROBE2(lgrp_choose_start, lgrp_id_t, lgrpid_start, 3082 lgrp_id_t, cpupart->cp_lgrp_hint); 3083 3084 /* 3085 * Use lgroup affinities (if any) to choose best lgroup 3086 * 3087 * NOTE: Assumes that thread is protected from going away and its 3088 * lgroup affinities won't change (ie. p_lock, or 3089 * thread_lock() being held and/or CPUs paused) 3090 */ 3091 if (t->t_lgrp_affinity) { 3092 lpl = lgrp_affinity_best(t, cpupart, lgrpid_start); 3093 if (lpl != NULL) 3094 return (lpl); 3095 } 3096 3097 ASSERT(klgrpset_ismember(lgrpset, lgrpid_start)); 3098 bestlpl = &cpupart->cp_lgrploads[lgrpid_start]; 3099 3100 do { 3101 pgcnt_t npgs; 3102 3103 /* 3104 * Skip any lgroups outside of thread's pset 3105 */ 3106 if (!klgrpset_ismember(lgrpset, lgrpid)) { 3107 if (++lgrpid > lgrp_alloc_max) 3108 lgrpid = 0; /* wrap the search */ 3109 continue; 3110 } 3111 3112 /* 3113 * Skip any non-leaf lgroups 3114 */ 3115 if (lgrp_table[lgrpid]->lgrp_childcnt != 0) 3116 continue; 3117 3118 /* 3119 * Skip any lgroups without enough free memory 3120 * (when threshold set to nonzero positive value) 3121 */ 3122 if (lgrp_mem_free_thresh > 0) { 3123 npgs = lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_FREE); 3124 if (npgs < lgrp_mem_free_thresh) { 3125 if (++lgrpid > lgrp_alloc_max) 3126 lgrpid = 0; /* wrap the search */ 3127 continue; 3128 } 3129 } 3130 3131 lpl = &cpupart->cp_lgrploads[lgrpid]; 3132 if (klgrpset_isempty(p->p_lgrpset) || 3133 klgrpset_ismember(p->p_lgrpset, lgrpid)) { 3134 /* 3135 * Either this is a new process or the process already 3136 * has threads on this lgrp, so this is a preferred 3137 * lgroup for the thread. 
3138 */ 3139 if (lpl_pick(lpl, bestlpl)) { 3140 bestload = lpl->lpl_loadavg; 3141 bestlpl = lpl; 3142 } 3143 } else { 3144 /* 3145 * The process doesn't have any threads on this lgrp, 3146 * but we're willing to consider this lgrp if the load 3147 * difference is big enough to justify splitting up 3148 * the process' threads. 3149 */ 3150 if (lpl_pick(lpl, bestrlpl)) { 3151 bestrload = lpl->lpl_loadavg; 3152 bestrlpl = lpl; 3153 } 3154 } 3155 if (++lgrpid > lgrp_alloc_max) 3156 lgrpid = 0; /* wrap the search */ 3157 } while (lgrpid != lgrpid_start); 3158 3159 /* 3160 * Return root lgroup if threshold isn't set to maximum value and 3161 * lowest lgroup load average more than a certain threshold 3162 */ 3163 if (lgrp_load_thresh != UINT32_MAX && 3164 bestload >= lgrp_load_thresh && bestrload >= lgrp_load_thresh) 3165 return (&cpupart->cp_lgrploads[lgrp_root->lgrp_id]); 3166 3167 /* 3168 * If all the lgroups over which the thread's process is spread are 3169 * heavily loaded, we'll consider placing the thread on one of the 3170 * other leaf lgroups in the thread's partition. 3171 */ 3172 if ((bestload > LGRP_EXPAND_PROC_THRESH(bestlpl->lpl_ncpu)) && 3173 (bestrload < bestload) && /* paranoid about wraparound */ 3174 (bestrload + LGRP_EXPAND_PROC_DIFF(bestrlpl->lpl_ncpu) < 3175 bestload)) { 3176 bestlpl = bestrlpl; 3177 } 3178 3179 cpupart->cp_lgrp_hint = bestlpl->lpl_lgrpid; 3180 bestlpl->lpl_homed_time = gethrtime_unscaled(); 3181 3182 ASSERT(bestlpl->lpl_ncpu > 0); 3183 return (bestlpl); 3184 } 3185 3186 /* 3187 * Return 1 if lpl1 is a better candidate than lpl2 for lgrp homing. 3188 */ 3189 static int 3190 lpl_pick(lpl_t *lpl1, lpl_t *lpl2) 3191 { 3192 lgrp_load_t l1, l2; 3193 lgrp_load_t tolerance = LGRP_LOADAVG_TOLERANCE(lpl1->lpl_ncpu); 3194 3195 3196 if (lpl2 == NULL) 3197 return (1); 3198 3199 l1 = lpl1->lpl_loadavg; 3200 l2 = lpl2->lpl_loadavg; 3201 3202 if ((l1 + tolerance < l2) && (l1 < l2)) { 3203 /* lpl1 is significantly less loaded than lpl2 */ 3204 return (1); 3205 } 3206 3207 if (lgrp_choose_policy == LGRP_CHOOSE_TIME && 3208 l1 + tolerance >= l2 && l1 < l2 && 3209 lpl1->lpl_homed_time < lpl2->lpl_homed_time) { 3210 /* 3211 * lpl1's load is within the tolerance of lpl2. We're 3212 * willing to consider it be to better however if 3213 * it has been longer since we last homed a thread there 3214 */ 3215 return (1); 3216 } 3217 3218 return (0); 3219 } 3220 3221 /* 3222 * An LWP is expected to be assigned to an lgroup for at least this long 3223 * for its anticipatory load to be justified. NOTE that this value should 3224 * not be set extremely huge (say, larger than 100 years), to avoid problems 3225 * with overflow in the calculation that uses it. 3226 */ 3227 #define LGRP_MIN_NSEC (NANOSEC / 10) /* 1/10 of a second */ 3228 hrtime_t lgrp_min_nsec = LGRP_MIN_NSEC; 3229 3230 /* 3231 * Routine to change a thread's lgroup affiliation. This routine updates 3232 * the thread's kthread_t struct and its process' proc_t struct to note the 3233 * thread's new lgroup affiliation, and its lgroup affinities. 3234 * 3235 * Note that this is the only routine that modifies a thread's t_lpl field, 3236 * and that adds in or removes anticipatory load. 3237 * 3238 * If the thread is exiting, newlpl is NULL. 
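 * (In that case only the accounting for the old lgroup is done; no anticipatory load is added anywhere.)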
3239 * 3240 * Locking: 3241 * The following lock must be held on entry: 3242 * cpu_lock, kpreempt_disable(), or thread_lock -- to assure t's new lgrp 3243 * doesn't get removed from t's partition 3244 * 3245 * This routine is not allowed to grab any locks, since it may be called 3246 * with cpus paused (such as from cpu_offline). 3247 */ 3248 void 3249 lgrp_move_thread(kthread_t *t, lpl_t *newlpl, int do_lgrpset_delete) 3250 { 3251 proc_t *p; 3252 lpl_t *lpl, *oldlpl; 3253 lgrp_id_t oldid; 3254 kthread_t *tp; 3255 uint_t ncpu; 3256 lgrp_load_t old, new; 3257 3258 ASSERT(t); 3259 ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 || 3260 THREAD_LOCK_HELD(t)); 3261 3262 /* 3263 * If not changing lpls, just return 3264 */ 3265 if ((oldlpl = t->t_lpl) == newlpl) 3266 return; 3267 3268 /* 3269 * Make sure the thread's lwp hasn't exited (if so, this thread is now 3270 * associated with process 0 rather than with its original process). 3271 */ 3272 if (t->t_proc_flag & TP_LWPEXIT) { 3273 if (newlpl != NULL) { 3274 t->t_lpl = newlpl; 3275 } 3276 return; 3277 } 3278 3279 p = ttoproc(t); 3280 3281 /* 3282 * If the thread had a previous lgroup, update its process' p_lgrpset 3283 * to account for it being moved from its old lgroup. 3284 */ 3285 if ((oldlpl != NULL) && /* thread had a previous lgroup */ 3286 (p->p_tlist != NULL)) { 3287 oldid = oldlpl->lpl_lgrpid; 3288 3289 if (newlpl != NULL) 3290 lgrp_stat_add(oldid, LGRP_NUM_MIGR, 1); 3291 3292 if ((do_lgrpset_delete) && 3293 (klgrpset_ismember(p->p_lgrpset, oldid))) { 3294 for (tp = p->p_tlist->t_forw; ; tp = tp->t_forw) { 3295 /* 3296 * Check if a thread other than the thread 3297 * that's moving is assigned to the same 3298 * lgroup as the thread that's moving. Note 3299 * that we have to compare lgroup IDs, rather 3300 * than simply comparing t_lpl's, since the 3301 * threads may belong to different partitions 3302 * but be assigned to the same lgroup. 3303 */ 3304 ASSERT(tp->t_lpl != NULL); 3305 3306 if ((tp != t) && 3307 (tp->t_lpl->lpl_lgrpid == oldid)) { 3308 /* 3309 * Another thread is assigned to the 3310 * same lgroup as the thread that's 3311 * moving, p_lgrpset doesn't change. 3312 */ 3313 break; 3314 } else if (tp == p->p_tlist) { 3315 /* 3316 * No other thread is assigned to the 3317 * same lgroup as the exiting thread, 3318 * clear the lgroup's bit in p_lgrpset. 3319 */ 3320 klgrpset_del(p->p_lgrpset, oldid); 3321 break; 3322 } 3323 } 3324 } 3325 3326 /* 3327 * If this thread was assigned to its old lgroup for such a 3328 * short amount of time that the anticipatory load that was 3329 * added on its behalf has aged very little, remove that 3330 * anticipatory load. 3331 */ 3332 if ((t->t_anttime + lgrp_min_nsec > gethrtime()) && 3333 ((ncpu = oldlpl->lpl_ncpu) > 0)) { 3334 lpl = oldlpl; 3335 for (;;) { 3336 do { 3337 old = new = lpl->lpl_loadavg; 3338 new -= LGRP_LOADAVG_MAX_EFFECT(ncpu); 3339 if (new > old) { 3340 /* 3341 * this can happen if the load 3342 * average was aged since we 3343 * added in the anticipatory 3344 * load 3345 */ 3346 new = 0; 3347 } 3348 } while (cas32( 3349 (lgrp_load_t *)&lpl->lpl_loadavg, old, 3350 new) != old); 3351 3352 lpl = lpl->lpl_parent; 3353 if (lpl == NULL) 3354 break; 3355 3356 ncpu = lpl->lpl_ncpu; 3357 ASSERT(ncpu > 0); 3358 } 3359 } 3360 } 3361 /* 3362 * If the thread has a new lgroup (i.e. it's not exiting), update its 3363 * t_lpl and its process' p_lgrpset, and apply an anticipatory load 3364 * to its new lgroup to account for its move to its new lgroup. 
3365 */ 3366 if (newlpl != NULL) { 3367 /* 3368 * This thread is moving to a new lgroup 3369 */ 3370 t->t_lpl = newlpl; 3371 3372 /* 3373 * Reflect move in load average of new lgroup 3374 * unless it is root lgroup 3375 */ 3376 if (lgrp_table[newlpl->lpl_lgrpid] == lgrp_root) 3377 return; 3378 3379 if (!klgrpset_ismember(p->p_lgrpset, newlpl->lpl_lgrpid)) { 3380 klgrpset_add(p->p_lgrpset, newlpl->lpl_lgrpid); 3381 } 3382 3383 /* 3384 * It'll take some time for the load on the new lgroup 3385 * to reflect this thread's placement on it. We'd 3386 * like not, however, to have all threads between now 3387 * and then also piling on to this lgroup. To avoid 3388 * this pileup, we anticipate the load this thread 3389 * will generate on its new lgroup. The goal is to 3390 * make the lgroup's load appear as though the thread 3391 * had been there all along. We're very conservative 3392 * in calculating this anticipatory load, we assume 3393 * the worst case case (100% CPU-bound thread). This 3394 * may be modified in the future to be more accurate. 3395 */ 3396 lpl = newlpl; 3397 for (;;) { 3398 ncpu = lpl->lpl_ncpu; 3399 ASSERT(ncpu > 0); 3400 do { 3401 old = new = lpl->lpl_loadavg; 3402 new += LGRP_LOADAVG_MAX_EFFECT(ncpu); 3403 /* 3404 * Check for overflow 3405 * Underflow not possible here 3406 */ 3407 if (new < old) 3408 new = UINT32_MAX; 3409 } while (cas32((lgrp_load_t *)&lpl->lpl_loadavg, old, 3410 new) != old); 3411 3412 lpl = lpl->lpl_parent; 3413 if (lpl == NULL) 3414 break; 3415 } 3416 t->t_anttime = gethrtime(); 3417 } 3418 } 3419 3420 /* 3421 * Return lgroup memory allocation policy given advice from madvise(3C) 3422 */ 3423 lgrp_mem_policy_t 3424 lgrp_madv_to_policy(uchar_t advice, size_t size, int type) 3425 { 3426 switch (advice) { 3427 case MADV_ACCESS_LWP: 3428 return (LGRP_MEM_POLICY_NEXT); 3429 case MADV_ACCESS_MANY: 3430 return (LGRP_MEM_POLICY_RANDOM); 3431 default: 3432 return (lgrp_mem_policy_default(size, type)); 3433 } 3434 } 3435 3436 /* 3437 * Figure out default policy 3438 */ 3439 lgrp_mem_policy_t 3440 lgrp_mem_policy_default(size_t size, int type) 3441 { 3442 cpupart_t *cp; 3443 lgrp_mem_policy_t policy; 3444 size_t pset_mem_size; 3445 3446 /* 3447 * Randomly allocate memory across lgroups for shared memory 3448 * beyond a certain threshold 3449 */ 3450 if ((type != MAP_SHARED && size > lgrp_privm_random_thresh) || 3451 (type == MAP_SHARED && size > lgrp_shm_random_thresh)) { 3452 /* 3453 * Get total memory size of current thread's pset 3454 */ 3455 kpreempt_disable(); 3456 cp = curthread->t_cpupart; 3457 klgrpset_totalsize(cp->cp_lgrpset, pset_mem_size); 3458 kpreempt_enable(); 3459 3460 /* 3461 * Choose policy to randomly allocate memory across 3462 * lgroups in pset if it will fit and is not default 3463 * partition. Otherwise, allocate memory randomly 3464 * across machine. 3465 */ 3466 if (lgrp_mem_pset_aware && size < pset_mem_size) 3467 policy = LGRP_MEM_POLICY_RANDOM_PSET; 3468 else 3469 policy = LGRP_MEM_POLICY_RANDOM; 3470 } else 3471 /* 3472 * Apply default policy for private memory and 3473 * shared memory under the respective random 3474 * threshold. 
3475 */ 3476 policy = lgrp_mem_default_policy; 3477 3478 return (policy); 3479 } 3480 3481 /* 3482 * Get memory allocation policy for this segment 3483 */ 3484 lgrp_mem_policy_info_t * 3485 lgrp_mem_policy_get(struct seg *seg, caddr_t vaddr) 3486 { 3487 lgrp_mem_policy_info_t *policy_info; 3488 extern struct seg_ops segspt_ops; 3489 extern struct seg_ops segspt_shmops; 3490 3491 /* 3492 * This is for binary compatibility to protect against third party 3493 * segment drivers which haven't recompiled to allow for 3494 * SEGOP_GETPOLICY() 3495 */ 3496 if (seg->s_ops != &segvn_ops && seg->s_ops != &segspt_ops && 3497 seg->s_ops != &segspt_shmops) 3498 return (NULL); 3499 3500 policy_info = NULL; 3501 if (seg->s_ops->getpolicy != NULL) 3502 policy_info = SEGOP_GETPOLICY(seg, vaddr); 3503 3504 return (policy_info); 3505 } 3506 3507 /* 3508 * Set policy for allocating private memory given desired policy, policy info, 3509 * size in bytes of memory that policy is being applied. 3510 * Return 0 if policy wasn't set already and 1 if policy was set already 3511 */ 3512 int 3513 lgrp_privm_policy_set(lgrp_mem_policy_t policy, 3514 lgrp_mem_policy_info_t *policy_info, size_t size) 3515 { 3516 3517 ASSERT(policy_info != NULL); 3518 3519 if (policy == LGRP_MEM_POLICY_DEFAULT) 3520 policy = lgrp_mem_policy_default(size, MAP_PRIVATE); 3521 3522 /* 3523 * Policy set already? 3524 */ 3525 if (policy == policy_info->mem_policy) 3526 return (1); 3527 3528 /* 3529 * Set policy 3530 */ 3531 policy_info->mem_policy = policy; 3532 policy_info->mem_reserved = 0; 3533 3534 return (0); 3535 } 3536 3537 3538 /* 3539 * Get shared memory allocation policy with given tree and offset 3540 */ 3541 lgrp_mem_policy_info_t * 3542 lgrp_shm_policy_get(struct anon_map *amp, ulong_t anon_index, vnode_t *vp, 3543 u_offset_t vn_off) 3544 { 3545 u_offset_t off; 3546 lgrp_mem_policy_info_t *policy_info; 3547 lgrp_shm_policy_seg_t *policy_seg; 3548 lgrp_shm_locality_t *shm_locality; 3549 avl_tree_t *tree; 3550 avl_index_t where; 3551 3552 /* 3553 * Get policy segment tree from anon_map or vnode and use specified 3554 * anon index or vnode offset as offset 3555 * 3556 * Assume that no lock needs to be held on anon_map or vnode, since 3557 * they should be protected by their reference count which must be 3558 * nonzero for an existing segment 3559 */ 3560 if (amp) { 3561 ASSERT(amp->refcnt != 0); 3562 shm_locality = amp->locality; 3563 if (shm_locality == NULL) 3564 return (NULL); 3565 tree = shm_locality->loc_tree; 3566 off = ptob(anon_index); 3567 } else if (vp) { 3568 shm_locality = vp->v_locality; 3569 if (shm_locality == NULL) 3570 return (NULL); 3571 ASSERT(shm_locality->loc_count != 0); 3572 tree = shm_locality->loc_tree; 3573 off = vn_off; 3574 } 3575 3576 if (tree == NULL) 3577 return (NULL); 3578 3579 /* 3580 * Lookup policy segment for offset into shared object and return 3581 * policy info 3582 */ 3583 rw_enter(&shm_locality->loc_lock, RW_READER); 3584 policy_info = NULL; 3585 policy_seg = avl_find(tree, &off, &where); 3586 if (policy_seg) 3587 policy_info = &policy_seg->shm_policy; 3588 rw_exit(&shm_locality->loc_lock); 3589 3590 return (policy_info); 3591 } 3592 3593 /* 3594 * Return lgroup to use for allocating memory 3595 * given the segment and address 3596 * 3597 * There isn't any mutual exclusion that exists between calls 3598 * to this routine and DR, so this routine and whomever calls it 3599 * should be mindful of the possibility that the lgrp returned 3600 * may be deleted. 
If this happens, dereferences of the lgrp 3601 * pointer will still be safe, but the resources in the lgrp will 3602 * be gone, and LGRP_EXISTS() will no longer be true. 3603 */ 3604 lgrp_t * 3605 lgrp_mem_choose(struct seg *seg, caddr_t vaddr, size_t pgsz) 3606 { 3607 int i; 3608 lgrp_t *lgrp; 3609 klgrpset_t lgrpset; 3610 int lgrps_spanned; 3611 unsigned long off; 3612 lgrp_mem_policy_t policy; 3613 lgrp_mem_policy_info_t *policy_info; 3614 ushort_t random; 3615 int stat = 0; 3616 3617 /* 3618 * Just return null if the lgrp framework hasn't finished 3619 * initializing or if this is a UMA machine. 3620 */ 3621 if (nlgrps == 1 || !lgrp_initialized) 3622 return (lgrp_root); 3623 3624 /* 3625 * Get memory allocation policy for this segment 3626 */ 3627 policy = lgrp_mem_default_policy; 3628 if (seg != NULL) { 3629 if (seg->s_as == &kas) { 3630 if (policy == LGRP_MEM_POLICY_RANDOM_PROC || 3631 policy == LGRP_MEM_POLICY_RANDOM_PSET) 3632 policy = LGRP_MEM_POLICY_RANDOM; 3633 } else { 3634 policy_info = lgrp_mem_policy_get(seg, vaddr); 3635 if (policy_info != NULL) 3636 policy = policy_info->mem_policy; 3637 } 3638 } 3639 lgrpset = 0; 3640 3641 /* 3642 * Initialize lgroup to home by default 3643 */ 3644 lgrp = lgrp_home_lgrp(); 3645 3646 /* 3647 * When homing threads on root lgrp, override default memory 3648 * allocation policies with root lgroup memory allocation policy 3649 */ 3650 if (lgrp == lgrp_root) 3651 policy = lgrp_mem_policy_root; 3652 3653 /* 3654 * Implement policy 3655 */ 3656 switch (policy) { 3657 case LGRP_MEM_POLICY_NEXT_CPU: 3658 3659 /* 3660 * Return lgroup of current CPU which faulted on memory 3661 * If the CPU isn't currently in an lgrp, then opt to 3662 * allocate from the root. 3663 * 3664 * Kernel preemption needs to be disabled here to prevent 3665 * the current CPU from going away before lgrp is found. 3666 */ 3667 if (LGRP_CPU_HAS_NO_LGRP(CPU)) { 3668 lgrp = lgrp_root; 3669 } else { 3670 kpreempt_disable(); 3671 lgrp = lgrp_cpu_to_lgrp(CPU); 3672 kpreempt_enable(); 3673 } 3674 break; 3675 3676 case LGRP_MEM_POLICY_NEXT: 3677 case LGRP_MEM_POLICY_DEFAULT: 3678 default: 3679 3680 /* 3681 * Just return current thread's home lgroup 3682 * for default policy (next touch) 3683 * If the thread is homed to the root, 3684 * then the default policy is random across lgroups. 3685 * Fallthrough to the random case. 3686 */ 3687 if (lgrp != lgrp_root) { 3688 if (policy == LGRP_MEM_POLICY_NEXT) 3689 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_NEXT, 1); 3690 else 3691 lgrp_stat_add(lgrp->lgrp_id, 3692 LGRP_NUM_DEFAULT, 1); 3693 break; 3694 } 3695 /* LINTED fallthrough on case statement */ 3696 case LGRP_MEM_POLICY_RANDOM: 3697 3698 /* 3699 * Return a random leaf lgroup with memory 3700 */ 3701 lgrpset = lgrp_root->lgrp_set[LGRP_RSRC_MEM]; 3702 /* 3703 * Count how many lgroups are spanned 3704 */ 3705 klgrpset_nlgrps(lgrpset, lgrps_spanned); 3706 3707 /* 3708 * There may be no memnodes in the root lgroup during DR copy 3709 * rename on a system with only two boards (memnodes) 3710 * configured. In this case just return the root lgrp. 
3711 */ 3712 if (lgrps_spanned == 0) { 3713 lgrp = lgrp_root; 3714 break; 3715 } 3716 3717 /* 3718 * Pick a random offset within lgroups spanned 3719 * and return lgroup at that offset 3720 */ 3721 random = (ushort_t)gethrtime() >> 4; 3722 off = random % lgrps_spanned; 3723 ASSERT(off <= lgrp_alloc_max); 3724 3725 for (i = 0; i <= lgrp_alloc_max; i++) { 3726 if (!klgrpset_ismember(lgrpset, i)) 3727 continue; 3728 if (off) 3729 off--; 3730 else { 3731 lgrp = lgrp_table[i]; 3732 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_RANDOM, 3733 1); 3734 break; 3735 } 3736 } 3737 break; 3738 3739 case LGRP_MEM_POLICY_RANDOM_PROC: 3740 3741 /* 3742 * Grab copy of bitmask of lgroups spanned by 3743 * this process 3744 */ 3745 klgrpset_copy(lgrpset, curproc->p_lgrpset); 3746 stat = LGRP_NUM_RANDOM_PROC; 3747 3748 /* LINTED fallthrough on case statement */ 3749 case LGRP_MEM_POLICY_RANDOM_PSET: 3750 3751 if (!stat) 3752 stat = LGRP_NUM_RANDOM_PSET; 3753 3754 if (klgrpset_isempty(lgrpset)) { 3755 /* 3756 * Grab copy of bitmask of lgroups spanned by 3757 * this processor set 3758 */ 3759 kpreempt_disable(); 3760 klgrpset_copy(lgrpset, 3761 curthread->t_cpupart->cp_lgrpset); 3762 kpreempt_enable(); 3763 } 3764 3765 /* 3766 * Count how many lgroups are spanned 3767 */ 3768 klgrpset_nlgrps(lgrpset, lgrps_spanned); 3769 ASSERT(lgrps_spanned <= nlgrps); 3770 3771 /* 3772 * Probably lgrps_spanned should be always non-zero, but to be 3773 * on the safe side we return lgrp_root if it is empty. 3774 */ 3775 if (lgrps_spanned == 0) { 3776 lgrp = lgrp_root; 3777 break; 3778 } 3779 3780 /* 3781 * Pick a random offset within lgroups spanned 3782 * and return lgroup at that offset 3783 */ 3784 random = (ushort_t)gethrtime() >> 4; 3785 off = random % lgrps_spanned; 3786 ASSERT(off <= lgrp_alloc_max); 3787 3788 for (i = 0; i <= lgrp_alloc_max; i++) { 3789 if (!klgrpset_ismember(lgrpset, i)) 3790 continue; 3791 if (off) 3792 off--; 3793 else { 3794 lgrp = lgrp_table[i]; 3795 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_RANDOM, 3796 1); 3797 break; 3798 } 3799 } 3800 break; 3801 3802 case LGRP_MEM_POLICY_ROUNDROBIN: 3803 3804 /* 3805 * Use offset within segment to determine 3806 * offset from home lgroup to choose for 3807 * next lgroup to allocate memory from 3808 */ 3809 off = ((unsigned long)(vaddr - seg->s_base) / pgsz) % 3810 (lgrp_alloc_max + 1); 3811 3812 kpreempt_disable(); 3813 lgrpset = lgrp_root->lgrp_set[LGRP_RSRC_MEM]; 3814 i = lgrp->lgrp_id; 3815 kpreempt_enable(); 3816 3817 while (off > 0) { 3818 i = (i + 1) % (lgrp_alloc_max + 1); 3819 lgrp = lgrp_table[i]; 3820 if (klgrpset_ismember(lgrpset, i)) 3821 off--; 3822 } 3823 lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ROUNDROBIN, 1); 3824 3825 break; 3826 } 3827 3828 ASSERT(lgrp != NULL); 3829 return (lgrp); 3830 } 3831 3832 /* 3833 * Return the number of pages in an lgroup 3834 * 3835 * NOTE: NUMA test (numat) driver uses this, so changing arguments or semantics 3836 * could cause tests that rely on the numat driver to fail.... 
3837 */ 3838 pgcnt_t 3839 lgrp_mem_size(lgrp_id_t lgrpid, lgrp_mem_query_t query) 3840 { 3841 lgrp_t *lgrp; 3842 3843 lgrp = lgrp_table[lgrpid]; 3844 if (!LGRP_EXISTS(lgrp) || 3845 klgrpset_isempty(lgrp->lgrp_set[LGRP_RSRC_MEM]) || 3846 !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid)) 3847 return (0); 3848 3849 return (lgrp_plat_mem_size(lgrp->lgrp_plathand, query)); 3850 } 3851 3852 /* 3853 * Initialize lgroup shared memory allocation policy support 3854 */ 3855 void 3856 lgrp_shm_policy_init(struct anon_map *amp, vnode_t *vp) 3857 { 3858 lgrp_shm_locality_t *shm_locality; 3859 3860 /* 3861 * Initialize locality field in anon_map 3862 * Don't need any locks because this is called when anon_map is 3863 * allocated, but not used anywhere yet. 3864 */ 3865 if (amp) { 3866 ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER); 3867 if (amp->locality == NULL) { 3868 /* 3869 * Allocate and initialize shared memory locality info 3870 * and set anon_map locality pointer to it 3871 * Drop lock across kmem_alloc(KM_SLEEP) 3872 */ 3873 ANON_LOCK_EXIT(&amp->a_rwlock); 3874 shm_locality = kmem_alloc(sizeof (*shm_locality), 3875 KM_SLEEP); 3876 rw_init(&shm_locality->loc_lock, NULL, RW_DEFAULT, 3877 NULL); 3878 shm_locality->loc_count = 1; /* not used for amp */ 3879 shm_locality->loc_tree = NULL; 3880 3881 /* 3882 * Reacquire lock and check to see whether anyone beat 3883 * us to initializing the locality info 3884 */ 3885 ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER); 3886 if (amp->locality != NULL) { 3887 rw_destroy(&shm_locality->loc_lock); 3888 kmem_free(shm_locality, 3889 sizeof (*shm_locality)); 3890 } else 3891 amp->locality = shm_locality; 3892 } 3893 ANON_LOCK_EXIT(&amp->a_rwlock); 3894 return; 3895 } 3896 3897 /* 3898 * Allocate shared vnode policy info if vnode is not locality aware yet 3899 */ 3900 mutex_enter(&vp->v_lock); 3901 if ((vp->v_flag & V_LOCALITY) == 0) { 3902 /* 3903 * Allocate and initialize shared memory locality info 3904 */ 3905 mutex_exit(&vp->v_lock); 3906 shm_locality = kmem_alloc(sizeof (*shm_locality), KM_SLEEP); 3907 rw_init(&shm_locality->loc_lock, NULL, RW_DEFAULT, NULL); 3908 shm_locality->loc_count = 1; 3909 shm_locality->loc_tree = NULL; 3910 3911 /* 3912 * Point vnode locality field at shared vnode policy info 3913 * and set locality aware flag in vnode 3914 */ 3915 mutex_enter(&vp->v_lock); 3916 if ((vp->v_flag & V_LOCALITY) == 0) { 3917 vp->v_locality = shm_locality; 3918 vp->v_flag |= V_LOCALITY; 3919 } else { 3920 /* 3921 * Lost race so free locality info and increment count.

/*
 * Initialize lgroup shared memory allocation policy support
 */
void
lgrp_shm_policy_init(struct anon_map *amp, vnode_t *vp)
{
	lgrp_shm_locality_t *shm_locality;

	/*
	 * Initialize locality field in anon_map
	 * Don't need any locks because this is called when anon_map is
	 * allocated, but not used anywhere yet.
	 */
	if (amp) {
		ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
		if (amp->locality == NULL) {
			/*
			 * Allocate and initialize shared memory locality info
			 * and set anon_map locality pointer to it
			 * Drop lock across kmem_alloc(KM_SLEEP)
			 */
			ANON_LOCK_EXIT(&amp->a_rwlock);
			shm_locality = kmem_alloc(sizeof (*shm_locality),
			    KM_SLEEP);
			rw_init(&shm_locality->loc_lock, NULL, RW_DEFAULT,
			    NULL);
			shm_locality->loc_count = 1;	/* not used for amp */
			shm_locality->loc_tree = NULL;

			/*
			 * Reacquire lock and check to see whether anyone beat
			 * us to initializing the locality info
			 */
			ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
			if (amp->locality != NULL) {
				rw_destroy(&shm_locality->loc_lock);
				kmem_free(shm_locality,
				    sizeof (*shm_locality));
			} else
				amp->locality = shm_locality;
		}
		ANON_LOCK_EXIT(&amp->a_rwlock);
		return;
	}

	/*
	 * Allocate shared vnode policy info if vnode is not locality aware yet
	 */
	mutex_enter(&vp->v_lock);
	if ((vp->v_flag & V_LOCALITY) == 0) {
		/*
		 * Allocate and initialize shared memory locality info
		 */
		mutex_exit(&vp->v_lock);
		shm_locality = kmem_alloc(sizeof (*shm_locality), KM_SLEEP);
		rw_init(&shm_locality->loc_lock, NULL, RW_DEFAULT, NULL);
		shm_locality->loc_count = 1;
		shm_locality->loc_tree = NULL;

		/*
		 * Point vnode locality field at shared vnode policy info
		 * and set locality aware flag in vnode
		 */
		mutex_enter(&vp->v_lock);
		if ((vp->v_flag & V_LOCALITY) == 0) {
			vp->v_locality = shm_locality;
			vp->v_flag |= V_LOCALITY;
		} else {
			/*
			 * Lost race so free locality info and increment count.
			 */
			rw_destroy(&shm_locality->loc_lock);
			kmem_free(shm_locality, sizeof (*shm_locality));
			shm_locality = vp->v_locality;
			shm_locality->loc_count++;
		}
		mutex_exit(&vp->v_lock);

		return;
	}

	/*
	 * Increment reference count of number of segments mapping this vnode
	 * shared
	 */
	shm_locality = vp->v_locality;
	shm_locality->loc_count++;
	mutex_exit(&vp->v_lock);
}

/*
 * Destroy the given shared memory policy segment tree
 */
void
lgrp_shm_policy_tree_destroy(avl_tree_t *tree)
{
	lgrp_shm_policy_seg_t *cur;
	lgrp_shm_policy_seg_t *next;

	if (tree == NULL)
		return;

	cur = (lgrp_shm_policy_seg_t *)avl_first(tree);
	while (cur != NULL) {
		next = AVL_NEXT(tree, cur);
		avl_remove(tree, cur);
		kmem_free(cur, sizeof (*cur));
		cur = next;
	}
	kmem_free(tree, sizeof (avl_tree_t));
}

/*
 * Uninitialize lgroup shared memory allocation policy support
 */
void
lgrp_shm_policy_fini(struct anon_map *amp, vnode_t *vp)
{
	lgrp_shm_locality_t *shm_locality;

	/*
	 * For anon_map, deallocate shared memory policy tree and
	 * zero locality field
	 * Don't need any locks because anon_map is being freed
	 */
	if (amp) {
		if (amp->locality == NULL)
			return;
		shm_locality = amp->locality;
		shm_locality->loc_count = 0;	/* not really used for amp */
		rw_destroy(&shm_locality->loc_lock);
		lgrp_shm_policy_tree_destroy(shm_locality->loc_tree);
		kmem_free(shm_locality, sizeof (*shm_locality));
		amp->locality = 0;
		return;
	}

	/*
	 * For vnode, decrement reference count of segments mapping this vnode
	 * shared and delete locality info if reference count drops to 0
	 */
	mutex_enter(&vp->v_lock);
	shm_locality = vp->v_locality;
	shm_locality->loc_count--;

	if (shm_locality->loc_count == 0) {
		rw_destroy(&shm_locality->loc_lock);
		lgrp_shm_policy_tree_destroy(shm_locality->loc_tree);
		kmem_free(shm_locality, sizeof (*shm_locality));
		vp->v_locality = 0;
		vp->v_flag &= ~V_LOCALITY;
	}
	mutex_exit(&vp->v_lock);
}
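
/*
 * The vnode side of lgrp_shm_policy_init()/lgrp_shm_policy_fini() is
 * reference counted: every shared segment mapping the vnode calls init (the
 * first call allocates the locality info and sets V_LOCALITY, later calls
 * just bump loc_count), and each matching fini call drops loc_count, tearing
 * the policy tree down once it reaches zero.  A sketch of that pairing over
 * a segment's lifetime (illustrative only, not compiled; segment setup and
 * teardown details omitted):
 */
#if 0
static void
lgrp_shm_policy_lifetime_example(vnode_t *vp)
{
	/* Segment creation: become a user of vp's shared policy info */
	lgrp_shm_policy_init(NULL, vp);		/* loc_count++ (or alloc) */

	/* ... segment lives; policies are set and queried ... */

	/* Segment teardown: drop the reference taken above */
	lgrp_shm_policy_fini(NULL, vp);		/* loc_count--; freed at 0 */
}
#endif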

/*
 * Compare two shared memory policy segments
 * Used by AVL tree code for searching
 */
int
lgrp_shm_policy_compar(const void *x, const void *y)
{
	lgrp_shm_policy_seg_t *a = (lgrp_shm_policy_seg_t *)x;
	lgrp_shm_policy_seg_t *b = (lgrp_shm_policy_seg_t *)y;

	if (a->shm_off < b->shm_off)
		return (-1);
	if (a->shm_off >= b->shm_off + b->shm_size)
		return (1);
	return (0);
}
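
/*
 * lgrp_shm_policy_compar() treats a segment as the half-open range
 * [shm_off, shm_off + shm_size) and returns 0 for any offset inside it, so
 * avl_find() doubles as a "which segment covers this offset" lookup.  Callers
 * in this file pass a pointer to a bare u_offset_t as the search key, which
 * relies on shm_off being the leading member of lgrp_shm_policy_seg_t.  An
 * equivalent, more explicit lookup would look roughly like the sketch below
 * (illustrative only, not compiled):
 */
#if 0
static lgrp_shm_policy_seg_t *
lgrp_shm_policy_lookup(avl_tree_t *tree, u_offset_t off)
{
	lgrp_shm_policy_seg_t key;
	avl_index_t where;

	key.shm_off = off;	/* probe for the segment containing "off" */
	key.shm_size = 0;	/* not examined when used as the search key */

	return (avl_find(tree, &key, &where));
}
#endif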

/*
 * Concatenate seg1 with seg2 and remove seg2
 */
static int
lgrp_shm_policy_concat(avl_tree_t *tree, lgrp_shm_policy_seg_t *seg1,
    lgrp_shm_policy_seg_t *seg2)
{
	if (!seg1 || !seg2 ||
	    seg1->shm_off + seg1->shm_size != seg2->shm_off ||
	    seg1->shm_policy.mem_policy != seg2->shm_policy.mem_policy)
		return (-1);

	seg1->shm_size += seg2->shm_size;
	avl_remove(tree, seg2);
	kmem_free(seg2, sizeof (*seg2));
	return (0);
}

/*
 * Split segment at given offset and return rightmost (uppermost) segment
 * Assumes that there are no overlapping segments
 */
static lgrp_shm_policy_seg_t *
lgrp_shm_policy_split(avl_tree_t *tree, lgrp_shm_policy_seg_t *seg,
    u_offset_t off)
{
	lgrp_shm_policy_seg_t *newseg;
	avl_index_t where;

	ASSERT(seg != NULL);
	ASSERT(off >= seg->shm_off && off <= seg->shm_off + seg->shm_size);

	if (!seg || off < seg->shm_off || off > seg->shm_off +
	    seg->shm_size)
		return (NULL);

	if (off == seg->shm_off || off == seg->shm_off + seg->shm_size)
		return (seg);

	/*
	 * Adjust size of left segment and allocate new (right) segment
	 */
	newseg = kmem_alloc(sizeof (lgrp_shm_policy_seg_t), KM_SLEEP);
	newseg->shm_policy = seg->shm_policy;
	newseg->shm_off = off;
	newseg->shm_size = seg->shm_size - (off - seg->shm_off);
	seg->shm_size = off - seg->shm_off;

	/*
	 * Find where to insert new segment in AVL tree and insert it
	 */
	(void) avl_find(tree, &off, &where);
	avl_insert(tree, newseg, where);

	return (newseg);
}
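
/*
 * A concrete picture of the two helpers above, with illustrative offsets and
 * a tree that currently holds one segment [0, 64K) with policy P: splitting
 * at 16K shrinks the existing segment to [0, 16K) and returns a new segment
 * [16K, 64K) carrying the same policy.  If two adjacent segments end up with
 * the same policy again, lgrp_shm_policy_concat() merges them back.  Sketched
 * as it might be exercised (not compiled):
 */
#if 0
	avl_tree_t *tree;		/* policy tree for the shared object */
	lgrp_shm_policy_seg_t *left;	/* currently [0, 64K), policy P */
	lgrp_shm_policy_seg_t *right;

	/* Split at 16K: left becomes [0, 16K), right is [16K, 64K) */
	right = lgrp_shm_policy_split(tree, left, 0x4000);

	/* Both halves still carry policy P, so they can be merged again */
	(void) lgrp_shm_policy_concat(tree, left, right);	/* [0, 64K) */
#endif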

/*
 * Set shared memory allocation policy on specified shared object at given
 * offset and length
 *
 * Return 0 if policy wasn't set already, 1 if policy was set already, and
 * -1 if can't set policy.
 */
int
lgrp_shm_policy_set(lgrp_mem_policy_t policy, struct anon_map *amp,
    ulong_t anon_index, vnode_t *vp, u_offset_t vn_off, size_t len)
{
	u_offset_t eoff;
	lgrp_shm_policy_seg_t *next;
	lgrp_shm_policy_seg_t *newseg;
	u_offset_t off;
	u_offset_t oldeoff;
	lgrp_shm_policy_seg_t *prev;
	int retval;
	lgrp_shm_policy_seg_t *seg;
	lgrp_shm_locality_t *shm_locality;
	avl_tree_t *tree;
	avl_index_t where;

	ASSERT(amp || vp);
	ASSERT((len & PAGEOFFSET) == 0);

	if (len == 0)
		return (-1);

	retval = 0;

	/*
	 * Get locality info and starting offset into shared object
	 * Try anon map first and then vnode
	 * Assume that no locks need to be held on anon_map or vnode, since
	 * it should be protected by its reference count which must be nonzero
	 * for an existing segment.
	 */
	if (amp) {
		/*
		 * Get policy info from anon_map
		 */
		ASSERT(amp->refcnt != 0);
		if (amp->locality == NULL)
			lgrp_shm_policy_init(amp, NULL);
		shm_locality = amp->locality;
		off = ptob(anon_index);
	} else if (vp) {
		/*
		 * Get policy info from vnode
		 */
		if ((vp->v_flag & V_LOCALITY) == 0 || vp->v_locality == NULL)
			lgrp_shm_policy_init(NULL, vp);
		shm_locality = vp->v_locality;
		ASSERT(shm_locality->loc_count != 0);
		off = vn_off;
	} else
		return (-1);

	ASSERT((off & PAGEOFFSET) == 0);

	/*
	 * Figure out default policy
	 */
	if (policy == LGRP_MEM_POLICY_DEFAULT)
		policy = lgrp_mem_policy_default(len, MAP_SHARED);

	/*
	 * Create AVL tree if there isn't one yet
	 * and set locality field to point at it
	 */
	rw_enter(&shm_locality->loc_lock, RW_WRITER);
	tree = shm_locality->loc_tree;
	if (!tree) {
		rw_exit(&shm_locality->loc_lock);

		tree = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);

		rw_enter(&shm_locality->loc_lock, RW_WRITER);
		if (shm_locality->loc_tree == NULL) {
			avl_create(tree, lgrp_shm_policy_compar,
			    sizeof (lgrp_shm_policy_seg_t),
			    offsetof(lgrp_shm_policy_seg_t, shm_tree));
			shm_locality->loc_tree = tree;
		} else {
			/*
			 * Another thread managed to set up the tree
			 * before we could. Free the tree we allocated
			 * and use the one that's already there.
			 */
			kmem_free(tree, sizeof (*tree));
			tree = shm_locality->loc_tree;
		}
	}

	/*
	 * Set policy
	 *
	 * Need to maintain hold on writer's lock to keep tree from
	 * changing out from under us
	 */
	while (len != 0) {
		/*
		 * Find policy segment for specified offset into shared object
		 */
		seg = avl_find(tree, &off, &where);

		/*
		 * Didn't find any existing segment that contains specified
		 * offset, so allocate new segment, insert it, and concatenate
		 * with adjacent segments if possible
		 */
		if (seg == NULL) {
			newseg = kmem_alloc(sizeof (lgrp_shm_policy_seg_t),
			    KM_SLEEP);
			newseg->shm_policy.mem_policy = policy;
			newseg->shm_policy.mem_reserved = 0;
			newseg->shm_off = off;
			avl_insert(tree, newseg, where);

			/*
			 * Check to see whether new segment overlaps with next
			 * one, set length of new segment accordingly, and
			 * calculate remaining length and next offset
			 */
			seg = AVL_NEXT(tree, newseg);
			if (seg == NULL || off + len <= seg->shm_off) {
				newseg->shm_size = len;
				len = 0;
			} else {
				newseg->shm_size = seg->shm_off - off;
				off = seg->shm_off;
				len -= newseg->shm_size;
			}

			/*
			 * Try to concatenate new segment with next and
			 * previous ones, since they might have the same policy
			 * now. Grab previous and next segments first because
			 * they will change on concatenation.
			 */
			prev = AVL_PREV(tree, newseg);
			next = AVL_NEXT(tree, newseg);
			(void) lgrp_shm_policy_concat(tree, newseg, next);
			(void) lgrp_shm_policy_concat(tree, prev, newseg);

			continue;
		}

		eoff = off + len;
		oldeoff = seg->shm_off + seg->shm_size;

		/*
		 * Policy set already?
		 */
		if (policy == seg->shm_policy.mem_policy) {
			/*
			 * Nothing left to do if offset and length
			 * fall within this segment
			 */
			if (eoff <= oldeoff) {
				retval = 1;
				break;
			} else {
				len = eoff - oldeoff;
				off = oldeoff;
				continue;
			}
		}

		/*
		 * Specified offset and length match existing segment exactly
		 */
		if (off == seg->shm_off && len == seg->shm_size) {
			/*
			 * Set policy and update current length
			 */
			seg->shm_policy.mem_policy = policy;
			seg->shm_policy.mem_reserved = 0;
			len = 0;

			/*
			 * Try concatenating new segment with previous and next
			 * segments, since they might have the same policy now.
			 * Grab previous and next segments first because they
			 * will change on concatenation.
			 */
			prev = AVL_PREV(tree, seg);
			next = AVL_NEXT(tree, seg);
			(void) lgrp_shm_policy_concat(tree, seg, next);
			(void) lgrp_shm_policy_concat(tree, prev, seg);
		} else {
			/*
			 * Specified offset and length only apply to part of
			 * existing segment
			 */

			/*
			 * New segment starts in middle of old one, so split
			 * new one off near beginning of old one
			 */
			newseg = NULL;
			if (off > seg->shm_off) {
				newseg = lgrp_shm_policy_split(tree, seg, off);

				/*
				 * New segment ends where old one did, so try
				 * to concatenate with next segment
				 */
				if (eoff == oldeoff) {
					newseg->shm_policy.mem_policy = policy;
					newseg->shm_policy.mem_reserved = 0;
					(void) lgrp_shm_policy_concat(tree,
					    newseg, AVL_NEXT(tree, newseg));
					break;
				}
			}

			/*
			 * New segment ends before old one, so split off end of
			 * old one
			 */
			if (eoff < oldeoff) {
				if (newseg) {
					(void) lgrp_shm_policy_split(tree,
					    newseg, eoff);
					newseg->shm_policy.mem_policy = policy;
					newseg->shm_policy.mem_reserved = 0;
				} else {
					(void) lgrp_shm_policy_split(tree, seg,
					    eoff);
					seg->shm_policy.mem_policy = policy;
					seg->shm_policy.mem_reserved = 0;
				}

				if (off == seg->shm_off)
					(void) lgrp_shm_policy_concat(tree,
					    AVL_PREV(tree, seg), seg);
				break;
			}

			/*
			 * Calculate remaining length and next offset
			 */
			len = eoff - oldeoff;
			off = oldeoff;
		}
	}

	rw_exit(&shm_locality->loc_lock);
	return (retval);
}
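
/*
 * Putting the pieces together: starting from an empty policy tree for a 64K
 * shared anon object, setting a policy on the middle 32K creates one segment
 * covering just that range.  The walk-through below is illustrative only
 * (not compiled); it assumes "amp" is the object's anon_map with a nonzero
 * refcnt, uses btop() to turn a byte offset into the anon index expected by
 * the interface, and the offsets are made up for the example.
 */
#if 0
	/*
	 * Apply LGRP_MEM_POLICY_RANDOM to bytes [16K, 48K) of the shared
	 * object: anon_index is the page index of offset 16K and the length
	 * is 32K (page aligned, as the function asserts).
	 */
	(void) lgrp_shm_policy_set(LGRP_MEM_POLICY_RANDOM, amp,
	    btop(0x4000), NULL, 0, 0x8000);

	/*
	 * The tree now holds a single segment:
	 *	[16K, 48K)	LGRP_MEM_POLICY_RANDOM
	 * Repeating the same call over the same range returns 1 ("policy was
	 * set already") without modifying the tree.
	 */
#endif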

/*
 * Return the best memnode from which to allocate memory given
 * an lgroup.
 *
 * "c" is for cookie, which is good enough for me.
 * It references a cookie struct that should be zeroed to initialize.
 * The cookie should live on the caller's stack.
 *
 * The routine returns -1 when:
 *	- the search scope is LGRP_SRCH_LOCAL and all the memnodes in the
 *	  given lgroup have been returned, or
 *	- the scope allows climbing the hierarchy and all the memnodes in
 *	  the system have been returned.
 */
int
lgrp_memnode_choose(lgrp_mnode_cookie_t *c)
{
	lgrp_t *lp = c->lmc_lgrp;
	mnodeset_t nodes = c->lmc_nodes;
	int cnt = c->lmc_cnt;
	int offset, mnode;

	extern int max_mem_nodes;

	/*
	 * If the set is empty, and the caller is willing, traverse
	 * up the hierarchy until we find a non-empty set.
	 */
	while (nodes == (mnodeset_t)0 || cnt <= 0) {
		if (c->lmc_scope == LGRP_SRCH_LOCAL ||
		    ((lp = lp->lgrp_parent) == NULL))
			return (-1);

		nodes = lp->lgrp_mnodes & ~(c->lmc_tried);
		cnt = lp->lgrp_nmnodes - c->lmc_ntried;
	}

	/*
	 * Select a memnode by picking one at a "random" offset.
	 * Because of DR, memnodes can come and go at any time.
	 * This code must be able to cope with the possibility
	 * that the nodes count "cnt" is inconsistent with respect
	 * to the number of elements actually in "nodes", and
	 * therefore that the offset chosen could be greater than
	 * the number of elements in the set (some memnodes may
	 * have disappeared just before cnt was read).
	 * If this happens, the search simply wraps back to the
	 * beginning of the set.
	 */
	ASSERT(nodes != (mnodeset_t)0 && cnt > 0);
	offset = c->lmc_rand % cnt;
	do {
		for (mnode = 0; mnode < max_mem_nodes; mnode++)
			if (nodes & ((mnodeset_t)1 << mnode))
				if (!offset--)
					break;
	} while (mnode >= max_mem_nodes);

	/* Found a node. Store state before returning. */
	c->lmc_lgrp = lp;
	c->lmc_nodes = (nodes & ~((mnodeset_t)1 << mnode));
	c->lmc_cnt = cnt - 1;
	c->lmc_tried = (c->lmc_tried | ((mnodeset_t)1 << mnode));
	c->lmc_ntried++;

	return (mnode);
}
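
/*
 * A sketch of the intended calling pattern (not compiled): the cookie lives
 * on the caller's stack, is zeroed, seeded from the starting lgroup, and then
 * lgrp_memnode_choose() is called repeatedly until it returns -1.  The field
 * names follow the uses above; the hand-rolled initialization here is just a
 * stand-in for whatever cookie-setup macro the lgroup headers provide.
 */
#if 0
static void
lgrp_memnode_walk_example(lgrp_t *lgrp)
{
	lgrp_mnode_cookie_t c;
	int mnode;

	/* Zero the cookie, then seed it from the starting lgroup */
	bzero(&c, sizeof (c));
	c.lmc_lgrp = lgrp;
	c.lmc_nodes = lgrp->lgrp_mnodes;
	c.lmc_cnt = lgrp->lgrp_nmnodes;
	c.lmc_scope = LGRP_SRCH_LOCAL;		/* don't climb the hierarchy */
	c.lmc_rand = (ushort_t)gethrtime() >> 4;

	/* Visit each memnode of "lgrp" once, in pseudo-random order */
	while ((mnode = lgrp_memnode_choose(&c)) != -1) {
		/* ... try allocating from "mnode" ... */
	}
}
#endif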